import MeCab
# Check that MeCab is installed and working
text = '아버지가방에들어가신다'
tokenizer = MeCab.Tagger()
print(tokenizer.parse(text))
아버지	NNG,*,F,아버지,*,*,*,*
가	JKS,*,F,가,*,*,*,*
방	NNG,장소,T,방,*,*,*,*
에	JKB,*,F,에,*,*,*,*
들어가	VV,*,F,들어가,*,*,*,*
신다	EP+EC,*,F,신다,Inflect,EP,EC,시/EP/*+ㄴ다/EC/*
EOS
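With the mecab-ko-dic dictionary, each output line is the surface form, a tab, and then the feature string: POS tag, semantic class, final-consonant flag (T/F), reading, type, first POS, last POS, and expression; the final line is always EOS. The tokenizer below relies on exactly this layout.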
stop_words = ['나','여기','무엇','그것','가','긋','가본','제','저','저희','그거','브','우리','그','큐티','쏘','슈','어디','뭐','자기','놀자','이곳','임','요방','너','그곳','거기','니티','노','이쪽','저기','호텔','곳','점','앞','시','원분','스로','도','대','플','기','그제','넥','딩','놀']
#stop_words = ['호텔','신라','스테이']
len(stop_words)
def getNVM_lemma(text):
    """Return noun/verb/adjective stems from a sentence, filtering out stop words."""
    tokenizer = MeCab.Tagger()
    parsed = tokenizer.parse(text)
    #print(parsed)
    word_tag = [w for w in parsed.split("\n")]
    pos = []
    tags = ["NNG", "NNP", "VV", "VA", "VCP", "VCN", "XR"]  # common/proper nouns, verbs, adjectives, copulas, roots
    for word_ in word_tag[:-2]:              # drop the trailing 'EOS' and empty line
        word = word_.split('\t')             # e.g. ['아버지', 'NNG,*,F,아버지,*,*,*,*']
        tag = word[1].split(",")             # e.g. ['EC', '*', 'F', '는다', '*', '*', '*', '*']
        if '+' in tag[0]:                    # token composed of several morphemes (e.g. VV+EP)
            if 'VV' in tag[0] or 'VA' in tag[0] or 'VX' in tag[0]:
                t = tag[-1].split('/')[0]    # take the stem from the expression field
                if t not in stop_words:
                    pos.append(t)
        elif (tag[0] in tags) and (word[0] not in stop_words):
            pos.append(word[0])
    return pos
import konlpy
from konlpy.tag import Komoran
s = "우리는 가까워질 수 없기 때문에 가깝게 느껴지지 않는다"
print(getNVM_lemma(s))
['가깝', '없', '가깝', '느끼']
s ="너랑 나는 가까운 사이였지만 이제는 가깝지 않아"
print(getNVM_lemma(s))
['가깝', '사이', '이제', '가깝']
import pandas as pd
review = pd.read_excel('df_all_1130_4.xlsx')
dict_site = {'아고다': 0, '야놀자': 1}  # encode the booking site (Agoda = 0, Yanolja = 1)
review['site'] = review['site'].map(dict_site)
review.head()
|   | site | hotel | score | review | date | star | length | review_spell_check | helpful | attitude | 담당자 | 위치 | 시설 | 인테리어 | 청결 | 친절 | 방음 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 나인트리 프리미어 명동2 | 10.0 | 뷰 좋고 위치 좋고 깨끗하고 최고 입니다 | NaN | NaN | 22 | 뷰 좋고 위치 좋고 깨끗하고 최고입니다 | 1 | 2 | 주현 | 1 | 0 | 0 | 0 | 0 | 0 |
| 1 | 0 | 신라스테이 광화문 | 10.0 | 위치 시설 모두 좋아요 | NaN | NaN | 12 | 위치 시설 모두 좋아요 | 0 | 2 | 주현 | 1 | 1 | 0 | 0 | 0 | 0 |
| 2 | 0 | 신라스테이 광화문 | 2.0 | 침대에 빨래 먼지로 보이는 먼지가 이불침대 시트 모두에 한가득이었습니다 청소 상태... | NaN | NaN | 69 | 침대에 빨래 먼지로 보이는 먼지가 이불 침대 시트 모두에 한가득이었습니다 청소 상... | 1 | 0 | 주현 | 0 | 0 | 0 | 1 | 0 | 0 |
| 3 | 0 | 신라스테이 광화문 | 8.8 | 위치 시설 서비스 모두 다 만족합니다 | NaN | NaN | 20 | 위치 시설 서비스 모두 다 만족합니다 | 0 | 2 | 주현 | 1 | 1 | 0 | 0 | 0 | 0 |
| 4 | 0 | 신라스테이 광화문 | 8.0 | 주변에 식사장소도 많고 볼 곳도 많아서 좋습니다다만 주말에는 집회가 근처에서 많아 ... | NaN | NaN | 57 | 주변에 식사 장소도 많고 볼 곳도 많아서 좋습니다 다만 주말에는 집회가 근처에서 많... | 1 | 1 | 주현 | 1 | 0 | 0 | 0 | 0 | 1 |
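Since `helpful` is the binary target of the classifier below, a quick class-balance check is worth doing before vectorizing. A minimal sketch using the loaded dataframe:

# Share of each class in the binary target column
print(review['helpful'].value_counts(normalize=True))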
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from keras.layers import Dense, Activation
cv = CountVectorizer(min_df=0,
                     tokenizer=getNVM_lemma,   # MeCab-based lemmatizer defined above
                     stop_words=stop_words,
                     preprocessor=None,
                     lowercase=False,
                     ngram_range=(1, 2))       # unigrams and bigrams
dtm = cv.fit_transform(review['review_spell_check'])
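To sanity-check what the vectorizer produced, the document-term matrix shape and a few vocabulary entries can be inspected. A minimal sketch (note: `get_feature_names_out` assumes scikit-learn >= 1.0; older versions use `get_feature_names`):

print(dtm.shape)                         # (number of reviews, vocabulary size)
print(cv.get_feature_names_out()[:20])   # first few uni-/bigram features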
import joblib
import numpy as np
from sklearn.model_selection import train_test_split
x = dtm.toarray()
y = review['helpful']
# add review length as a feature
review_len = np.array(review.length).reshape(-1,1)
x = np.hstack((x, review_len))
# add the site indicator as a feature
site_name = np.array(review.site).reshape(-1,1)
x = np.hstack((x, site_name))
x
array([[ 0,  0,  0, ..., 22, 22,  0],
       [ 0,  0,  0, ..., 12, 12,  0],
       [ 0,  0,  0, ..., 69, 69,  0],
       ...,
       [ 0,  0,  0, ..., 33, 33,  1],
       [ 0,  0,  0, ..., 59, 59,  1],
       [ 0,  0,  0, ..., 10, 10,  1]], dtype=int64)
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)
cv.tokenizer = None  # detach the MeCab-based tokenizer before pickling (avoids needing MeCab at load time)
joblib.dump((cv, x_train, x_test, y_train, y_test), 'hotel_lemma_cv_ver4.pkl')
['hotel_lemma_cv_ver4.pkl']
cv, x_train, x_test, y_train, y_test = joblib.load('hotel_lemma_cv_ver4.pkl')
x_train.shape
(1927, 15231)
x_test.shape
(482, 15231)
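Of the 15,231 feature columns, the last two are the appended review-length and site indicators; the remaining 15,229 come from the CountVectorizer vocabulary.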
import tensorflow as tf
model = tf.keras.models.Sequential()
model.add(tf.keras.layers.Dense(1,                               # single output unit: y is one binary column
                                input_shape=(15231,),            # must match the number of feature columns
                                activation='sigmoid',
                                kernel_regularizer=tf.keras.regularizers.l1_l2(0, 0.001)  # l1=0, l2=0.001 gave the best accuracy
                                ))
model.summary()
Model: "sequential_13" _________________________________________________________________ Layer (type) Output Shape Param # ================================================================= dense_13 (Dense) (None, 1) 15232 ================================================================= Total params: 15,232 Trainable params: 15,232 Non-trainable params: 0 _________________________________________________________________
model.compile(optimizer = 'adam', loss='binary_crossentropy', metrics = ['accuracy'])
model.fit(x_train, y_train.values, epochs=100, validation_split=0.1,
callbacks=[tf.keras.callbacks.EarlyStopping()])
Epoch 1/100
55/55 [==============================] - 1s 10ms/step - loss: 0.7790 - accuracy: 0.5383 - val_loss: 0.6062 - val_accuracy: 0.6788
Epoch 2/100
55/55 [==============================] - 0s 7ms/step - loss: 0.5951 - accuracy: 0.7771 - val_loss: 0.5784 - val_accuracy: 0.8135
Epoch 3/100
55/55 [==============================] - 0s 6ms/step - loss: 0.5536 - accuracy: 0.8442 - val_loss: 0.5585 - val_accuracy: 0.8394
Epoch 4/100
55/55 [==============================] - 0s 6ms/step - loss: 0.5200 - accuracy: 0.8997 - val_loss: 0.5343 - val_accuracy: 0.8342
Epoch 5/100
55/55 [==============================] - 0s 6ms/step - loss: 0.4970 - accuracy: 0.8904 - val_loss: 0.5203 - val_accuracy: 0.8549
Epoch 6/100
55/55 [==============================] - 0s 6ms/step - loss: 0.4718 - accuracy: 0.9056 - val_loss: 0.5056 - val_accuracy: 0.8549
Epoch 7/100
55/55 [==============================] - 0s 6ms/step - loss: 0.4556 - accuracy: 0.9032 - val_loss: 0.4931 - val_accuracy: 0.8549
Epoch 8/100
55/55 [==============================] - 0s 6ms/step - loss: 0.4301 - accuracy: 0.9074 - val_loss: 0.4831 - val_accuracy: 0.8497
Epoch 9/100
55/55 [==============================] - 0s 6ms/step - loss: 0.4196 - accuracy: 0.9136 - val_loss: 0.4747 - val_accuracy: 0.8497
Epoch 10/100
55/55 [==============================] - 0s 6ms/step - loss: 0.4034 - accuracy: 0.9195 - val_loss: 0.4697 - val_accuracy: 0.8394
Epoch 11/100
55/55 [==============================] - 0s 6ms/step - loss: 0.3986 - accuracy: 0.9086 - val_loss: 0.4604 - val_accuracy: 0.8549
Epoch 12/100
55/55 [==============================] - 0s 6ms/step - loss: 0.3875 - accuracy: 0.9169 - val_loss: 0.4549 - val_accuracy: 0.8549
Epoch 13/100
55/55 [==============================] - 0s 6ms/step - loss: 0.3798 - accuracy: 0.9175 - val_loss: 0.4499 - val_accuracy: 0.8497
Epoch 14/100
55/55 [==============================] - 0s 6ms/step - loss: 0.3731 - accuracy: 0.9200 - val_loss: 0.4458 - val_accuracy: 0.8497
Epoch 15/100
55/55 [==============================] - 0s 7ms/step - loss: 0.3659 - accuracy: 0.9196 - val_loss: 0.4424 - val_accuracy: 0.8549
Epoch 16/100
55/55 [==============================] - 0s 6ms/step - loss: 0.3593 - accuracy: 0.9211 - val_loss: 0.4441 - val_accuracy: 0.8290
<tensorflow.python.keras.callbacks.History at 0x2b49a577278>
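Training stops at epoch 16 because `EarlyStopping()` with default arguments monitors `val_loss` with `patience=0`, halting as soon as validation loss fails to improve (0.4441 > 0.4424 at epoch 15). A slightly more forgiving setup is sketched below; the patience value and `restore_best_weights` flag are illustrative choices, not part of the original run:

early_stop = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',          # same quantity as the default
    patience=3,                  # tolerate a few non-improving epochs
    restore_best_weights=True    # roll back to the best validation-loss weights
)
model.fit(x_train, y_train.values, epochs=100, validation_split=0.1, callbacks=[early_stop])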
model.evaluate(x_test, y_test.values, verbose=0)  # returns [loss, accuracy]
[0.433194637298584, 0.8340249061584473]
Comparing predicted labels against the actual labels:
prob=model.predict(
x_test,
batch_size=None,
verbose=0,
steps=None,
callbacks=None,
max_queue_size=10,
workers=1,
use_multiprocessing=False,
)
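`predict` returns one sigmoid probability per test review (an array of shape `(482, 1)`); the thresholds below turn these probabilities into hard 0/1 labels.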
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.metrics import precision_score
predictional = np.where(prob >= 0.5, 1, 0)  # default 0.5 decision threshold
confusion_matrix(y_test,predictional)
array([[265,  20],
       [ 60, 137]], dtype=int64)
TN,FP,FN,TP=confusion_matrix(y_test,predictional).ravel()
print(TN,FP,FN,TP)
265 20 60 137
print('Accuracy: ', accuracy_score(y_test, predictional))
print('Precision: ', precision_score(y_test, predictional))
Accuracy:  0.8340248962655602
Precision:  0.8726114649681529
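Accuracy and precision alone hide the 60 false negatives, so recall and F1 complete the picture. A minimal sketch using the same predictions:

from sklearn.metrics import recall_score, f1_score

print('Recall:   ', recall_score(y_test, predictional))   # TP / (TP + FN) = 137 / 197
print('F1 score: ', f1_score(y_test, predictional))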
prediction1 = np.where(prob >= 0.7, 1, 0)  # stricter decision threshold
confusion_matrix(y_test,prediction1)
TN,FP,FN,TP=confusion_matrix(y_test,prediction1).ravel()
print(TN,FP,FN,TP)
281 4 104 93
print('Accuracy: ', accuracy_score(y_test, prediction1))
print('Precision: ', precision_score(y_test, prediction1))
Accuracy:  0.7759336099585062
Precision:  0.9587628865979382
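Raising the threshold from 0.5 to 0.7 trades accuracy for precision: far fewer false positives (20 down to 4) at the cost of many more false negatives (60 up to 104). The full trade-off across all thresholds can be inspected with scikit-learn's precision-recall curve; a minimal sketch, assuming matplotlib is available:

from sklearn.metrics import precision_recall_curve
import matplotlib.pyplot as plt

# prob holds the sigmoid outputs from model.predict above
precisions, recalls, thresholds = precision_recall_curve(y_test, prob.ravel())
plt.plot(thresholds, precisions[:-1], label='precision')
plt.plot(thresholds, recalls[:-1], label='recall')
plt.xlabel('decision threshold')
plt.legend()
plt.show()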