import pandas as pd
import MeCab
import numpy as np
# Load the review data set from the Excel workbook.
df = pd.read_excel('data/비교.xlsx')


# Notebook display: (rows, columns) of the loaded frame.
df.shape

(2409, 16)


# Character count of every spell-checked review (non-string cells are
# measured through their str() form).
review_lengths = df['review_spell_check'].map(lambda review: len(str(review)))
df['length'] = review_lengths.astype(int)


# One 1.0/0.0 indicator column per booking site of interest.
for site_name, flag_column in (('아고다', 'agoda'), ('야놀자', 'yanolja')):
    df[flag_column] = (df['site'] == site_name).astype(float)


# Notebook display: first five rows with the new columns.
df.head(5)


# Words that the tokenizer drops from its output.
# To switch stop-word filtering off, replace this with an empty list.
stop_words = [
    '나', '여기', '무엇', '그것', '가', '긋', '가본', '제', '저', '저희',
    '그거', '브', '우리', '그', '큐티', '쏘', '슈', '어디', '뭐', '자기',
    '놀자', '이곳', '임', '요방', '너', '그곳', '거기', '니티', '노', '이쪽',
    '저기', '호텔', '곳', '점', '앞', '시', '원분', '스로', '도', '대',
    '플', '기', '그제', '넥', '딩', '놀',
]
# Notebook display: number of stop words.
len(stop_words)

46


def getNVM_lemma(text):
    """Tokenize ``text`` with MeCab and return the words kept as features.

    Every line of the MeCab output is read as ``surface<TAB>tag,feature,...``.
    Two kinds of tokens are kept:

    * tokens whose tag is exactly one of NNG, NNP, VV, VA, VCP, VCN or XR:
      the surface form is returned;
    * compound tokens (tag contains ``+``) whose tag mentions VV, VA or VX:
      the text before the first ``/`` of the last feature field is returned.

    Candidates that appear in the module-level ``stop_words`` are dropped.

    Args:
        text: The sentence or document to analyse.

    Returns:
        A list of the kept words, in order of appearance. The list is empty
        when the parser returns nothing.
    """
    # Create the tagger once and reuse it: this function is called once per
    # document by each vectorizer, and constructing a MeCab.Tagger for every
    # call is wasted work.
    tagger = getattr(getNVM_lemma, "_tagger", None)
    if tagger is None:
        tagger = MeCab.Tagger()
        getNVM_lemma._tagger = tagger

    parsed = tagger.parse(text)
    if not parsed:
        return []

    keep_tags = ("NNG", "NNP", "VV", "VA", "VCP", "VCN", "XR")
    compound_markers = ("VV", "VA", "VX")
    pos = []

    for line in parsed.split("\n"):
        # e.g. '아버지\tNNG,*,F,아버지,*,*,*,*'
        surface, tab, feature_text = line.partition("\t")
        if not tab:
            # "EOS", the trailing empty string, or any other line that
            # carries no feature column: nothing to inspect.
            continue
        features = feature_text.split(",")
        tag = features[0]
        if "+" in tag:  # token made of several morphemes
            if any(marker in tag for marker in compound_markers):
                lemma = features[-1].split("/")[0]
                if lemma not in stop_words:
                    pos.append(lemma)
        elif tag in keep_tags and surface not in stop_words:
            pos.append(surface)
    return pos


from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer


# Both vectorizers share the same settings: unigrams and bigrams over the
# tokens produced by getNVM_lemma, with no preprocessing or lower-casing.
vectorizer_options = {
    'ngram_range': (1, 2),
    'tokenizer': getNVM_lemma,
    'preprocessor': None,
    'lowercase': False,
}
tfidf = TfidfVectorizer(**vectorizer_options)
cv = CountVectorizer(**vectorizer_options)

# Reviews as a unicode string array (non-string cells become their text form).
review_texts = df['review_spell_check'].values.astype('U')
tdm_tfidf = tfidf.fit_transform(review_texts)
tdm_cv = cv.fit_transform(review_texts)
#LSA


from sklearn.model_selection import train_test_split


# Dense TF-IDF matrix, one row per review.
x_tf = tdm_tfidf.toarray()

# Append the extra per-review features as columns in a single step.
# Using the columns' underlying arrays keeps the pairing positional, so it
# does not depend on df having a 0..n-1 index, and avoids re-copying every
# row once per appended value.
x_tf_len = np.column_stack([
    x_tf,
    np.log(df['length'].values + 1),  # log of (review length + 1)
    df['agoda'].values,               # site indicator column 'agoda'
    df['yanolja'].values,             # site indicator column 'yanolja'
])


# Dense term-document matrix (count vectors), one row per review.
x_cv = tdm_cv.toarray()

# Columns appended after the log-length column, in this order.
extra_columns = ['agoda', 'yanolja', '위치', '시설', '인테리어', '청결', '친절', '방음']

# Append all extra features as columns in a single step.
# Using the columns' underlying arrays keeps the pairing positional, so it
# does not depend on df having a 0..n-1 index, and avoids re-copying every
# row once per appended value.
x_cv_len = np.column_stack(
    [x_cv, np.log(df['length'].values + 1)]
    + [df[name].values for name in extra_columns]
)


# Notebook display: number of features per review.
x_cv_len.shape[1]

2553


# Target: the 'helpful' column.
y = df['helpful']


# Hold out 20% of the rows for testing, with the same seed for both feature sets.
split_options = {'test_size': 0.2, 'random_state': 42}
# Count-vector features.
x_train, x_test, y_train, y_test = train_test_split(x_cv_len, y, **split_options)
# TF-IDF features.
x2_train, x2_test, y2_train, y2_test = train_test_split(x_tf_len, y, **split_options)


# Standardized features gave lower accuracy, so they are not used below.
from sklearn.preprocessing import StandardScaler

sc = StandardScaler()
# Learn mean and variance from the training split only ...
x2_train_std = sc.fit_transform(x2_train)
# ... and reuse them for the test split. Re-fitting on the test data would
# scale the two splits differently and leak test statistics.
x2_test_std = sc.transform(x2_test)


from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.metrics import precision_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold


# Logistic regression on the TF-IDF features
# (the count-vector model scores higher).
clf2 = LogisticRegression(random_state=42, C=0.1, max_iter=100)
clf2.fit(x2_train, y2_train)
# Predict on the TF-IDF test split: clf2 was trained on x2_train, so it must
# not be given the count-vector test rows, which have a different width.
pred2 = clf2.predict(x2_test)
accuracy_score(y2_test, pred2)

0.8402489626556017


# Logistic regression on the count-vector features
# (this variant scores higher than the TF-IDF one).
clf = LogisticRegression(C=0.1, max_iter=100, random_state=42)
clf.fit(x_train, y_train)
pred = clf.predict(x_test)
accuracy_score(y_test, pred)

0.8568464730290456


# Class probabilities for every test row: one row per sample, one column per class.
proba = clf.predict_proba(x_test)
n_rows, n_classes = proba.shape
print(n_rows)
print(n_classes)

482
2


# Second probability column, rounded to two decimals, shown next to the
# actual label of each test row ('예측' = predicted, '실제' = actual).
p_list = np.round(proba[:, 1], 2)
data = pd.DataFrame({'예측': p_list, '실제': y_test})
data.head()


# 예측이랑 전체 데이터랑 merge
#data.merge(df, left_index=True, right_index=True, how='left').to_excel('data/scatterplot.xlsx',index = False)


# Attach the full review rows to the predictions (left join on the index).
result = data.merge(df, left_index=True, right_index=True, how='left')
result.head()


# The 50 test reviews with the highest predicted probability.
top50=result.sort_values(['예측'],ascending=False).head(50)


# Sum and count of every category flag.
# NOTE(review): this aggregates over all of `result`; `top50` is never used.
# Confirm whether the top-50 subset was the intended input here.
result[['위치','시설','인테리어','청결','친절','방음']].agg(['sum','count'])


# Share of the 482 test rows that the top 50 represent.
50/482

0.1037344398340249


# Confusion matrix of the default predictions (rows: actual class, columns: predicted class).
confusion_matrix(y_test,pred)

array([[258,  27],
       [ 42, 155]])


# Flatten the 2x2 matrix row by row into the four counts.
TN,FP,FN,TP=confusion_matrix(y_test,pred).ravel()
print(TN,FP,FN,TP)

258 27 42 155


# Accuracy ('정확도') and precision ('정밀도') of the default predictions.
print('정확도: ',accuracy_score(y_test,pred))
print('정밀도: ',precision_score(y_test,pred))

정확도:  0.8568464730290456
정밀도:  0.8516483516483516


# Accuracy computed by hand from the confusion-matrix counts
print((TN + TP) / (TN +FN +FP +TP))
# Accuracy computed by the library function
print(accuracy_score(y_test,pred))

0.8526970954356846
0.8526970954356846


# Precision computed by hand from the confusion-matrix counts
print(TP /(TP+FP))
# Precision computed by the library function
print(precision_score(y_test,pred))

0.85
0.85


# Flag every probability that reaches 0.7, then keep the flag of the second
# probability column as the stricter prediction for each test row.
prediction1 = np.where(proba >= 0.7, 1, 0)
pred1_7 = [row_flags[1] for row_flags in prediction1]


# Confusion matrix of the 0.7-threshold predictions.
confusion_matrix(y_test,pred1_7)

array([[275,  10],
       [ 70, 127]])


# Flatten the 2x2 matrix row by row into the four counts (overwrites the earlier values).
TN,FP,FN,TP=confusion_matrix(y_test,pred1_7).ravel()
print(TN,FP,FN,TP)

275 10 70 127


# Accuracy ('정확도') and precision ('정밀도') of the 0.7-threshold predictions.
print('정확도: ',accuracy_score(y_test,pred1_7))
print('정밀도: ',precision_score(y_test,pred1_7))

정확도:  0.8340248962655602
정밀도:  0.927007299270073


# 10-fold cross-validation of the count-vector model on the training split.
# The fixed random_state makes the shuffled folds, and therefore the scores,
# reproducible from run to run, like the other seeded steps in this file.
kfold = KFold(n_splits=10, shuffle=True, random_state=42)
model = LogisticRegression(random_state=42, C=0.1)
scores = cross_val_score(model, x_train, y_train, cv=kfold)
print(scores)
print(scores.mean())

[0.87564767 0.8134715  0.88601036 0.84455959 0.84974093 0.84455959
 0.80829016 0.89583333 0.86979167 0.84895833]
0.8536863126079448


from sklearn.metrics import f1_score


# F1 score of the default predictions.
f1_score(y_test, pred)

0.8191489361702128


# F1 score of the 0.7-threshold predictions.
f1_score(y_test, pred1_7)

0.6283783783783785

	site	hotel	score	review	date	star	length	review_spell_check	helpful	attitude	위치	시설	청결	방음	agoda
0	아고다	나인트리 프리미어 명동2	10.0	뷰 좋고 위치 좋고 깨끗하고 최고 입니다	NaN	NaN	21	뷰 좋고 위치 좋고 깨끗하고 최고입니다	1	2	1	0	0	0	1.0
1	아고다	신라스테이 광화문	10.0	위치 시설 모두 좋아요	NaN	NaN	12	위치 시설 모두 좋아요	0	2	1	1	0	0	1.0
2	아고다	신라스테이 광화문	2.0	침대에 빨래 먼지로 보이는 먼지가 이불침대 시트 모두에 한가득이었습니다 청소 상태...	NaN	NaN	68	침대에 빨래 먼지로 보이는 먼지가 이불 침대 시트 모두에 한가득이었습니다 청소 상...	1	0	0	0	1	0	1.0
3	아고다	신라스테이 광화문	8.8	위치 시설 서비스 모두 다 만족합니다	NaN	NaN	20	위치 시설 서비스 모두 다 만족합니다	0	2	1	1	0	0	1.0
4	아고다	신라스테이 광화문	8.0	주변에 식사장소도 많고 볼 곳도 많아서 좋습니다다만 주말에는 집회가 근처에서 많아 ...	NaN	NaN	60	주변에 식사 장소도 많고 볼 곳도 많아서 좋습니다 다만 주말에는 집회가 근처에서 많...	1	1	1	0	0	1	1.0

	예측	실제
2009	0.69	1
2082	0.17	0
1684	0.30	1
2301	0.63	0
792	1.00	1

	예측	실제	site	hotel	score	review	date	star	length	review_spell_check	helpful	attitude	위치	시설	청결	친절	방음	yanolja
2009	0.69	1	야놀자	신라스테이 광화문	10.0	위치적으로 접근하기 좋았고 청결도와 직원 서비스가 깔끔해서 다시 가도 좋겠다 생각했습니다	2020. 04. 01	5.0	50	위치적으로 접근하기 좋았고 청결 도와 직원 서비스가 깔끔해서 다시 가도 좋겠다 생각...	1	2	1	0	1	1	0	1.0
2082	0.17	0	야놀자	신라스테이 광화문	10.0	좋습니다 역시 호텔임	2020. 10. 05	5.0	11	좋습니다 역시 호텔임	0	2	0	0	0	0	0	1.0
1684	0.30	1	야놀자	롯데 호텔 서울	8.0	정말 좋았습니다 다만 지금 공사기간입니다 나머지는 더할나위 없었네요	2018. 08. 04	4.0	38	정말 좋았습니다 다만 지금 공사기간입니다 나머지는 더할 나위 없었네요	1	1	0	0	0	0	0	1.0
2301	0.63	0	야놀자	신라스테이 광화문	8.0	깨끗하고 편의시설도 주변에 많고 친절하고 좋았어요 다음이 또 이용하겠습니다	2018-08-24 00:00:00	4.0	41	깨끗하고 편의시설도 주변에 많고 친절하고 좋았어요 다음이 또 이용하겠습니다	0	2	0	1	1	1	0	1.0
792	1.00	1	야놀자	그랜드 워커힐 서울	8.0	말로만 듣던 5성급 모든시설과 서비스 등 최고 청결도나 방음 좋다 뷰는 리버뷰이긴 ...	2019. 09. 23	4.0	237	말로만 듣던 5성급 모든 시설과 서비스 등 최고 청결도 나 방음 좋다 뷰는 리버뷰이...	1	0	0	1	1	0	1	1.0

mecab 단어 원형 추출¶

tfidf, count Vector¶

훈련, 테스트 데이터 나누기¶

X (feature)¶

y (target)¶

train, test split¶

정규화¶

LogisticRegression¶

확률로 뽑기¶

실제 값과 예측 확률 비교¶

helpful 상위 50개 중 카테고리 빈도수¶

Confusion Matrix (성능 측정)¶

정확도¶

정밀도¶

정밀도 높이기!!!¶

정확도, 정밀도¶

교차검증해보기¶