import MeCab
# Check that MeCab is installed and working
text = '아버지가방에들어가신다'
tokenizer = MeCab.Tagger()
print(tokenizer.parse(text))
아버지	NNG,*,F,아버지,*,*,*,*
가	JKS,*,F,가,*,*,*,*
방	NNG,장소,T,방,*,*,*,*
에	JKB,*,F,에,*,*,*,*
들어가	VV,*,F,들어가,*,*,*,*
신다	EP+EC,*,F,신다,Inflect,EP,EC,시/EP/*+ㄴ다/EC/*
EOS
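With the mecab-ko-dic dictionary, each output line is the surface form, a tab, and then the feature string: POS tag, semantic class, final-consonant flag (T/F), reading, type, first POS, last POS, and expression; the final line is always EOS. The tokenizer below relies on exactly this layout.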
stop_words = ['나','여기','무엇','그것','가','긋','가본','제','저','저희','그거','브','우리','그','큐티','쏘','슈','어디','뭐','자기','놀자','이곳','임','요방','너','그곳','거기','니티','노','이쪽','저기','호텔','곳','점','앞','시','원분','스로','도','대','플','기','그제','넥','딩','놀']
#stop_words = ['호텔','신라','스테이']
len(stop_words)
def getNVM_lemma(text):
    """Return noun/verb/adjective stems from a sentence, filtering out stop words."""
    tokenizer = MeCab.Tagger()
    parsed = tokenizer.parse(text)
    #print(parsed)
    word_tag = [w for w in parsed.split("\n")]
    pos = []
    tags = ["NNG", "NNP", "VV", "VA", "VCP", "VCN", "XR"]  # common/proper nouns, verbs, adjectives, copulas, roots
    for word_ in word_tag[:-2]:              # drop the trailing 'EOS' and empty line
        word = word_.split('\t')             # e.g. ['아버지', 'NNG,*,F,아버지,*,*,*,*']
        tag = word[1].split(",")             # e.g. ['EC', '*', 'F', '는다', '*', '*', '*', '*']
        if '+' in tag[0]:                    # token composed of several morphemes (e.g. VV+EP)
            if 'VV' in tag[0] or 'VA' in tag[0] or 'VX' in tag[0]:
                t = tag[-1].split('/')[0]    # take the stem from the expression field
                if t not in stop_words:
                    pos.append(t)
        elif (tag[0] in tags) and (word[0] not in stop_words):
            pos.append(word[0])
    return pos
import konlpy
from konlpy.tag import Komoran
s = "우리는 가까워질 수 없기 때문에 가깝게 느껴지지 않는다"
print(getNVM_lemma(s))
['가깝', '없', '가깝', '느끼']
s ="너랑 나는 가까운 사이였지만 이제는 가깝지 않아"
print(getNVM_lemma(s))
['가깝', '사이', '이제', '가깝']
import pandas as pd
review = pd.read_excel('df_all_1130_4.xlsx')
dict_site = {'아고다': 0, '야놀자': 1}  # encode the booking site (Agoda = 0, Yanolja = 1)
review['site'] = review['site'].map(dict_site)
review.head()
|   | site | hotel | score | review | date | star | length | review_spell_check | helpful | attitude | 담당자 | 위치 | 시설 | 인테리어 | 청결 | 친절 | 방음 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 나인트리 프리미어 명동2 | 10.0 | 뷰 좋고 위치 좋고 깨끗하고 최고 입니다 | NaN | NaN | 22 | 뷰 좋고 위치 좋고 깨끗하고 최고입니다 | 1 | 2 | 주현 | 1 | 0 | 0 | 0 | 0 | 0 |
| 1 | 0 | 신라스테이 광화문 | 10.0 | 위치 시설 모두 좋아요 | NaN | NaN | 12 | 위치 시설 모두 좋아요 | 0 | 2 | 주현 | 1 | 1 | 0 | 0 | 0 | 0 |
| 2 | 0 | 신라스테이 광화문 | 2.0 | 침대에 빨래 먼지로 보이는 먼지가 이불침대 시트 모두에 한가득이었습니다 청소 상태... | NaN | NaN | 69 | 침대에 빨래 먼지로 보이는 먼지가 이불 침대 시트 모두에 한가득이었습니다 청소 상... | 1 | 0 | 주현 | 0 | 0 | 0 | 1 | 0 | 0 |
| 3 | 0 | 신라스테이 광화문 | 8.8 | 위치 시설 서비스 모두 다 만족합니다 | NaN | NaN | 20 | 위치 시설 서비스 모두 다 만족합니다 | 0 | 2 | 주현 | 1 | 1 | 0 | 0 | 0 | 0 |
| 4 | 0 | 신라스테이 광화문 | 8.0 | 주변에 식사장소도 많고 볼 곳도 많아서 좋습니다다만 주말에는 집회가 근처에서 많아 ... | NaN | NaN | 57 | 주변에 식사 장소도 많고 볼 곳도 많아서 좋습니다 다만 주말에는 집회가 근처에서 많... | 1 | 1 | 주현 | 1 | 0 | 0 | 0 | 0 | 1 |
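Since `helpful` is the binary target of the classifier below, a quick class-balance check is worth doing before vectorizing. A minimal sketch using the loaded dataframe:

# Share of each class in the binary target column
print(review['helpful'].value_counts(normalize=True))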
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from keras.layers import Dense, Activation
cv = CountVectorizer(min_df=0,
                     tokenizer=getNVM_lemma,   # MeCab-based lemmatizer defined above
                     stop_words=stop_words,
                     preprocessor=None,
                     lowercase=False,
                     ngram_range=(1, 2))       # unigrams and bigrams
dtm = cv.fit_transform(review['review_spell_check'])
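To sanity-check what the vectorizer produced, the document-term matrix shape and a few vocabulary entries can be inspected. A minimal sketch (note: `get_feature_names_out` assumes scikit-learn >= 1.0; older versions use `get_feature_names`):

print(dtm.shape)                         # (number of reviews, vocabulary size)
print(cv.get_feature_names_out()[:20])   # first few uni-/bigram features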
import joblib
import numpy as np
from sklearn.model_selection import train_test_split
x = dtm.toarray()
y = review['helpful']
# add review length as a feature
review_len = np.array(review.length).reshape(-1,1)
x = np.hstack((x, review_len))
# add the site indicator as a feature
site_name = np.array(review.site).reshape(-1,1)
x = np.hstack((x, site_name))
x
array([[ 0,  0,  0, ..., 22, 22,  0],
       [ 0,  0,  0, ..., 12, 12,  0],
       [ 0,  0,  0, ..., 69, 69,  0],
       ...,
       [ 0,  0,  0, ..., 33, 33,  1],
       [ 0,  0,  0, ..., 59, 59,  1],
       [ 0,  0,  0, ..., 10, 10,  1]], dtype=int64)
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)
cv.tokenizer = None  # detach the MeCab-based tokenizer before pickling (avoids needing MeCab at load time)
joblib.dump((cv, x_train, x_test, y_train, y_test), 'hotel_lemma_cv_ver4.pkl')
['hotel_lemma_cv_ver4.pkl']
cv, x_train, x_test, y_train, y_test = joblib.load('hotel_lemma_cv_ver4.pkl')
x_train.shape
(1927, 15231)
x_test.shape
(482, 15231)
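Of the 15,231 feature columns, the last two are the appended review-length and site indicators; the remaining 15,229 come from the CountVectorizer vocabulary.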
import tensorflow as tf
model = tf.keras.models.Sequential()
model.add(tf.keras.layers.Dense(1,                               # single output unit: y is one binary column
                                input_shape=(15231,),            # must match the number of feature columns
                                activation='sigmoid',
                                kernel_regularizer=tf.keras.regularizers.l1_l2(0, 0.001)  # l1=0, l2=0.001 gave the best accuracy
                                ))
model.summary()
Model: "sequential_13" _________________________________________________________________ Layer (type) Output Shape Param # ================================================================= dense_13 (Dense) (None, 1) 15232 ================================================================= Total params: 15,232 Trainable params: 15,232 Non-trainable params: 0 _________________________________________________________________
model.compile(optimizer = 'adam', loss='binary_crossentropy', metrics = ['accuracy'])
model.fit(x_train, y_train.values, epochs=100, validation_split=0.1,
callbacks=[tf.keras.callbacks.EarlyStopping()])
Epoch 1/100
55/55 [==============================] - 1s 10ms/step - loss: 0.7790 - accuracy: 0.5383 - val_loss: 0.6062 - val_accuracy: 0.6788
Epoch 2/100
55/55 [==============================] - 0s 7ms/step - loss: 0.5951 - accuracy: 0.7771 - val_loss: 0.5784 - val_accuracy: 0.8135
Epoch 3/100
55/55 [==============================] - 0s 6ms/step - loss: 0.5536 - accuracy: 0.8442 - val_loss: 0.5585 - val_accuracy: 0.8394
Epoch 4/100
55/55 [==============================] - 0s 6ms/step - loss: 0.5200 - accuracy: 0.8997 - val_loss: 0.5343 - val_accuracy: 0.8342
Epoch 5/100
55/55 [==============================] - 0s 6ms/step - loss: 0.4970 - accuracy: 0.8904 - val_loss: 0.5203 - val_accuracy: 0.8549
Epoch 6/100
55/55 [==============================] - 0s 6ms/step - loss: 0.4718 - accuracy: 0.9056 - val_loss: 0.5056 - val_accuracy: 0.8549
Epoch 7/100
55/55 [==============================] - 0s 6ms/step - loss: 0.4556 - accuracy: 0.9032 - val_loss: 0.4931 - val_accuracy: 0.8549
Epoch 8/100
55/55 [==============================] - 0s 6ms/step - loss: 0.4301 - accuracy: 0.9074 - val_loss: 0.4831 - val_accuracy: 0.8497
Epoch 9/100
55/55 [==============================] - 0s 6ms/step - loss: 0.4196 - accuracy: 0.9136 - val_loss: 0.4747 - val_accuracy: 0.8497
Epoch 10/100
55/55 [==============================] - 0s 6ms/step - loss: 0.4034 - accuracy: 0.9195 - val_loss: 0.4697 - val_accuracy: 0.8394
Epoch 11/100
55/55 [==============================] - 0s 6ms/step - loss: 0.3986 - accuracy: 0.9086 - val_loss: 0.4604 - val_accuracy: 0.8549
Epoch 12/100
55/55 [==============================] - 0s 6ms/step - loss: 0.3875 - accuracy: 0.9169 - val_loss: 0.4549 - val_accuracy: 0.8549
Epoch 13/100
55/55 [==============================] - 0s 6ms/step - loss: 0.3798 - accuracy: 0.9175 - val_loss: 0.4499 - val_accuracy: 0.8497
Epoch 14/100
55/55 [==============================] - 0s 6ms/step - loss: 0.3731 - accuracy: 0.9200 - val_loss: 0.4458 - val_accuracy: 0.8497
Epoch 15/100
55/55 [==============================] - 0s 7ms/step - loss: 0.3659 - accuracy: 0.9196 - val_loss: 0.4424 - val_accuracy: 0.8549
Epoch 16/100
55/55 [==============================] - 0s 6ms/step - loss: 0.3593 - accuracy: 0.9211 - val_loss: 0.4441 - val_accuracy: 0.8290
<tensorflow.python.keras.callbacks.History at 0x2b49a577278>
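Training stops at epoch 16 because `EarlyStopping()` with default arguments monitors `val_loss` with `patience=0`, halting as soon as validation loss fails to improve (0.4441 > 0.4424 at epoch 15). A slightly more forgiving setup is sketched below; the patience value and `restore_best_weights` flag are illustrative choices, not part of the original run:

early_stop = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',          # same quantity as the default
    patience=3,                  # tolerate a few non-improving epochs
    restore_best_weights=True    # roll back to the best validation-loss weights
)
model.fit(x_train, y_train.values, epochs=100, validation_split=0.1, callbacks=[early_stop])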
model.evaluate(x_test, y_test.values, verbose=0)  # returns [loss, accuracy]
[0.433194637298584, 0.8340249061584473]
Comparing predicted labels against the actual labels:
prob=model.predict(
x_test,
batch_size=None,
verbose=0,
steps=None,
callbacks=None,
max_queue_size=10,
workers=1,
use_multiprocessing=False,
)
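`predict` returns one sigmoid probability per test review (an array of shape `(482, 1)`); the thresholds below turn these probabilities into hard 0/1 labels.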
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.metrics import precision_score
predictional = np.where(prob >= 0.5, 1, 0)  # default 0.5 decision threshold
confusion_matrix(y_test,predictional)
array([[265,  20],
       [ 60, 137]], dtype=int64)
TN,FP,FN,TP=confusion_matrix(y_test,predictional).ravel()
print(TN,FP,FN,TP)
265 20 60 137
print('Accuracy: ', accuracy_score(y_test, predictional))
print('Precision: ', precision_score(y_test, predictional))
Accuracy:  0.8340248962655602
Precision:  0.8726114649681529
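Accuracy and precision alone hide the 60 false negatives, so recall and F1 complete the picture. A minimal sketch using the same predictions:

from sklearn.metrics import recall_score, f1_score

print('Recall:   ', recall_score(y_test, predictional))   # TP / (TP + FN) = 137 / 197
print('F1 score: ', f1_score(y_test, predictional))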
prediction1 = np.where(prob >= 0.7, 1, 0)  # stricter decision threshold
confusion_matrix(y_test,prediction1)
TN,FP,FN,TP=confusion_matrix(y_test,prediction1).ravel()
print(TN,FP,FN,TP)
281 4 104 93
print('Accuracy: ', accuracy_score(y_test, prediction1))
print('Precision: ', precision_score(y_test, prediction1))
Accuracy:  0.7759336099585062
Precision:  0.9587628865979382
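Raising the threshold from 0.5 to 0.7 trades accuracy for precision: far fewer false positives (20 down to 4) at the cost of many more false negatives (60 up to 104). The full trade-off across all thresholds can be inspected with scikit-learn's precision-recall curve; a minimal sketch, assuming matplotlib is available:

from sklearn.metrics import precision_recall_curve
import matplotlib.pyplot as plt

# prob holds the sigmoid outputs from model.predict above
precisions, recalls, thresholds = precision_recall_curve(y_test, prob.ravel())
plt.plot(thresholds, precisions[:-1], label='precision')
plt.plot(thresholds, recalls[:-1], label='recall')
plt.xlabel('decision threshold')
plt.legend()
plt.show()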