import pandas as pd
# Load the full hotel-review dataset (site, hotel, score, review text,
# spell-checked text, and the 'helpful' label used as the target below).
review = pd.read_excel('data/data_all.xlsx')
# Peek at the first rows to confirm the expected columns are present.
review.head()
site | hotel | score | review | date | star | length | review_spell_check | helpful | attitude | |
---|---|---|---|---|---|---|---|---|---|---|
0 | 야놀자 | 신라스테이 광화문 | 10.0 | 깔끔합니다 | 2019. 03. 30 | 5.0 | 5 | 깔끔합니다 | 0 | 2 |
1 | 야놀자 | 신라스테이 광화문 | 6.0 | 다니기는 참 좋은데 청소를 아예 안 하시네요 | 2020. 09. 13 | 3.0 | 24 | 다니기는 참 좋은데 청소를 아예 안 하시네요 | 1 | 0 |
2 | 야놀자 | 나인트리 프리미어 명동2 | 10.0 | 깔끔한 내부가 좋았습니다 | 2019. 11. 25 | 5.0 | 13 | 깔끔한 내부가 좋았습니다 | 0 | 2 |
3 | 아고다 | 나인트리 프리미어 호텔 명동 2 | 10.0 | 가격이 저렴해서 어딘가 아쉬운점은 한두개씩 있겠거니 하고 큰 기대없이 갔는데 생각외... | NaN | NaN | 285 | 가격이 저렴해서 어딘가 아쉬운 점은 한두 개씩 있겠거니 하고 큰 기대 없이 갔는데 ... | 1 | 2 |
4 | 아고다 | 나인트리 프리미어 호텔 명동 2 | 7.6 | 우선 뷰가 공사현장 보이고 주유소 보여서 그랬구요 새벽 6시쯤 넘어서는 공사장 소... | NaN | NaN | 93 | 우선 뷰가 공사현장 보이고 주유소 보여서 그랬고요 새벽 6시쯤 넘어서는 공사장 소... | 1 | 0 |
# Dump every spell-checked review, one per line, as the SentencePiece
# training corpus.
corpus = '\n'.join(review['review_spell_check'])
with open('review.txt', 'w', encoding='utf-8') as f:
    f.write(corpus)
from sentencepiece import SentencePieceTrainer
# Train a 3000-piece subword model on the review corpus. Train() writes
# review.model / review.vocab to disk and does NOT return a processor,
# so the original assignment `sp = ...` was a misleading dead binding
# (it was immediately overwritten by the Processor below) and is dropped.
SentencePieceTrainer.Train('--input=review.txt --model_prefix=review --vocab_size=3000')
from sentencepiece import SentencePieceProcessor

# Restore the freshly trained subword tokenizer from disk.
# Load() returns True on success (the value echoed by the notebook).
sp = SentencePieceProcessor()
sp.Load("review.model")
True
# Sanity check: tokenize one raw review into subword pieces.
sp.encode_as_pieces(review.loc[123, 'review'])
['▁여러', '지', '점', '▁가', '봤', '는데', '▁느낌', '은', '▁다', '▁비슷', '하고', '▁광화문점', '이', '▁서울', '여행', '하기', '에', '▁위치', '는', '▁좋은', '것', '같', '아요', '▁대신', '▁다', '른', '지', '점', '들', '보다', '▁조금', '은', '▁비싼', '▁', '감이', '▁있', '네', '요']
from sklearn.feature_extraction.text import CountVectorizer
# Plug the SentencePiece tokenizer into CountVectorizer; without a tokenizer
# it would split on whitespace only. lowercase=False leaves the (Korean)
# text untouched.
cv = CountVectorizer(lowercase=False, tokenizer=sp.encode_as_pieces)
# Term-document matrix over the spell-checked reviews (sparse).
tdm = cv.fit_transform(review['review_spell_check'])
from sklearn.model_selection import train_test_split

# 80/20 split of the term-document matrix against the 'helpful' label;
# random_state pins the shuffle so the split is reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    tdm, review['helpful'], test_size=0.2, random_state=42)
import joblib
# The vectorizer holds a reference to sp.encode_as_pieces, which cannot be
# pickled. NOTE(review): after this the reloaded cv cannot transform new
# text until a tokenizer is reattached — only its fitted vocabulary survives.
cv.tokenizer = None # needed so the object can be saved with joblib
joblib.dump((cv, x_train, x_test, y_train, y_test), 'hotel_subword.pkl')
['hotel_subword.pkl']
cv, x_train, x_test, y_train, y_test = joblib.load('hotel_subword.pkl')
x_train.shape
(1974, 2996)
import tensorflow as tf

# Logistic regression over subword counts: a single sigmoid unit predicts
# whether a review was marked "helpful" (binary label).
model = tf.keras.models.Sequential()
model.add(tf.keras.layers.Dense(
    1,  # binary target -> one output unit
    # Derive the input width from the data instead of hard-coding it
    # (the original had 2996 while its own comment claimed 3018 —
    # tying it to x_train keeps model and data in sync).
    input_shape=(x_train.shape[1],),
    activation='sigmoid',
    # L2-only regularization (l1=0, l2=0.001) to shrink the weights.
    kernel_regularizer=tf.keras.regularizers.l1_l2(0, 0.001),
))
model.summary()
Model: "sequential" _________________________________________________________________ Layer (type) Output Shape Param # ================================================================= dense (Dense) (None, 1) 2997 ================================================================= Total params: 2,997 Trainable params: 2,997 Non-trainable params: 0 _________________________________________________________________
# Optimize binary cross-entropy with Adam, tracking accuracy.
model.compile(optimizer='adam', loss='binary_crossentropy',
              metrics=['accuracy'])

# Densify the sparse matrix for Keras; hold out 10% of the training data
# for validation and stop as soon as val_loss stops improving.
early_stop = tf.keras.callbacks.EarlyStopping()
model.fit(
    x_train.toarray(),
    y_train.values,
    epochs=100,
    validation_split=0.1,
    callbacks=[early_stop],
)
Epoch 1/100 56/56 [==============================] - 2s 13ms/step - loss: 0.6719 - accuracy: 0.5976 - val_loss: 0.6259 - val_accuracy: 0.6919 Epoch 2/100 56/56 [==============================] - 0s 7ms/step - loss: 0.6040 - accuracy: 0.7105 - val_loss: 0.6012 - val_accuracy: 0.7273 Epoch 3/100 56/56 [==============================] - 0s 4ms/step - loss: 0.5805 - accuracy: 0.8178 - val_loss: 0.5837 - val_accuracy: 0.7879 Epoch 4/100 56/56 [==============================] - 0s 3ms/step - loss: 0.5654 - accuracy: 0.8499 - val_loss: 0.5709 - val_accuracy: 0.8131 Epoch 5/100 56/56 [==============================] - 0s 3ms/step - loss: 0.5350 - accuracy: 0.8580 - val_loss: 0.5592 - val_accuracy: 0.8232 Epoch 6/100 56/56 [==============================] - 0s 4ms/step - loss: 0.5144 - accuracy: 0.8745 - val_loss: 0.5496 - val_accuracy: 0.8232 Epoch 7/100 56/56 [==============================] - 0s 4ms/step - loss: 0.4930 - accuracy: 0.8873 - val_loss: 0.5415 - val_accuracy: 0.8283 Epoch 8/100 56/56 [==============================] - 0s 4ms/step - loss: 0.4804 - accuracy: 0.8946 - val_loss: 0.5345 - val_accuracy: 0.8131 Epoch 9/100 56/56 [==============================] - 0s 4ms/step - loss: 0.4673 - accuracy: 0.8821 - val_loss: 0.5283 - val_accuracy: 0.8283 Epoch 10/100 56/56 [==============================] - 0s 3ms/step - loss: 0.4711 - accuracy: 0.8850 - val_loss: 0.5226 - val_accuracy: 0.8232 Epoch 11/100 56/56 [==============================] - 0s 4ms/step - loss: 0.4448 - accuracy: 0.8943 - val_loss: 0.5181 - val_accuracy: 0.8182 Epoch 12/100 56/56 [==============================] - 0s 4ms/step - loss: 0.4425 - accuracy: 0.8806 - val_loss: 0.5141 - val_accuracy: 0.8283 Epoch 13/100 56/56 [==============================] - 0s 4ms/step - loss: 0.4353 - accuracy: 0.8842 - val_loss: 0.5096 - val_accuracy: 0.8283 Epoch 14/100 56/56 [==============================] - 0s 4ms/step - loss: 0.4235 - accuracy: 0.8859 - val_loss: 0.5073 - val_accuracy: 0.8333 Epoch 15/100 56/56 
[==============================] - 0s 4ms/step - loss: 0.4072 - accuracy: 0.8934 - val_loss: 0.5039 - val_accuracy: 0.8333 Epoch 16/100 56/56 [==============================] - 0s 4ms/step - loss: 0.4106 - accuracy: 0.8962 - val_loss: 0.5014 - val_accuracy: 0.8283 Epoch 17/100 56/56 [==============================] - 0s 3ms/step - loss: 0.3998 - accuracy: 0.9025 - val_loss: 0.4988 - val_accuracy: 0.8283 Epoch 18/100 56/56 [==============================] - 0s 4ms/step - loss: 0.4008 - accuracy: 0.8995 - val_loss: 0.4971 - val_accuracy: 0.8283 Epoch 19/100 56/56 [==============================] - 0s 4ms/step - loss: 0.3836 - accuracy: 0.8987 - val_loss: 0.4957 - val_accuracy: 0.8232 Epoch 20/100 56/56 [==============================] - 0s 3ms/step - loss: 0.3869 - accuracy: 0.9041 - val_loss: 0.4943 - val_accuracy: 0.8182 Epoch 21/100 56/56 [==============================] - 0s 7ms/step - loss: 0.3801 - accuracy: 0.9082 - val_loss: 0.4929 - val_accuracy: 0.8232 Epoch 22/100 56/56 [==============================] - 0s 5ms/step - loss: 0.3793 - accuracy: 0.9113 - val_loss: 0.4916 - val_accuracy: 0.8182 Epoch 23/100 56/56 [==============================] - 0s 3ms/step - loss: 0.3823 - accuracy: 0.8915 - val_loss: 0.4908 - val_accuracy: 0.8182 Epoch 24/100 56/56 [==============================] - 0s 4ms/step - loss: 0.3744 - accuracy: 0.9134 - val_loss: 0.4895 - val_accuracy: 0.8131 Epoch 25/100 56/56 [==============================] - 0s 3ms/step - loss: 0.3625 - accuracy: 0.9146 - val_loss: 0.4889 - val_accuracy: 0.8131 Epoch 26/100 56/56 [==============================] - 0s 3ms/step - loss: 0.3693 - accuracy: 0.8981 - val_loss: 0.4874 - val_accuracy: 0.8182 Epoch 27/100 56/56 [==============================] - 0s 3ms/step - loss: 0.3575 - accuracy: 0.9222 - val_loss: 0.4868 - val_accuracy: 0.8182 Epoch 28/100 56/56 [==============================] - 0s 7ms/step - loss: 0.3583 - accuracy: 0.9063 - val_loss: 0.4863 - val_accuracy: 0.8232 Epoch 29/100 56/56 
[==============================] - 0s 6ms/step - loss: 0.3696 - accuracy: 0.9027 - val_loss: 0.4853 - val_accuracy: 0.8232 Epoch 30/100 56/56 [==============================] - 0s 5ms/step - loss: 0.3601 - accuracy: 0.9044 - val_loss: 0.4852 - val_accuracy: 0.8232 Epoch 31/100 56/56 [==============================] - 0s 5ms/step - loss: 0.3545 - accuracy: 0.9117 - val_loss: 0.4852 - val_accuracy: 0.8182 Epoch 32/100 56/56 [==============================] - 0s 4ms/step - loss: 0.3641 - accuracy: 0.8983 - val_loss: 0.4848 - val_accuracy: 0.8131 Epoch 33/100 56/56 [==============================] - 0s 4ms/step - loss: 0.3624 - accuracy: 0.9114 - val_loss: 0.4841 - val_accuracy: 0.8131 Epoch 34/100 56/56 [==============================] - 0s 4ms/step - loss: 0.3561 - accuracy: 0.9129 - val_loss: 0.4842 - val_accuracy: 0.8131
<tensorflow.python.keras.callbacks.History at 0x244feef0860>
# Held-out performance on the 20% test split.
model.evaluate(x_test.toarray(), y_test.values, verbose=0) # [loss, accuracy]
[0.5522161722183228, 0.8036437034606934]
# Inspect the learned coefficients: the Dense layer's trainable weights are
# (kernel, bias); we only need the per-token kernel weights.
weights, _ = model.trainable_weights

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back for older versions.
if hasattr(cv, 'get_feature_names_out'):
    feature_names = cv.get_feature_names_out()
else:
    feature_names = cv.get_feature_names()

# One row per vocabulary token with its logistic-regression weight.
token_weight = pd.DataFrame({'토큰': feature_names, '가중치': weights.numpy().flat})
# Most negative weights: tokens that push toward "not helpful".
token_weight.sort_values('가중치').head()
토큰 | 가중치 | |
---|---|---|
1434 | ▁좋아요 | -0.741736 |
337 | ▁깔끔 | -0.447911 |
295 | ▁굿 | -0.410221 |
1605 | ▁친절합니다 | -0.371433 |
274 | ▁괜찮아 | -0.368910 |
# Most positive weights: tokens most predictive of a "helpful" review.
token_weight.sort_values('가중치').tail()
토큰 | 가중치 | |
---|---|---|
1689 | ▁편의점 | 0.457330 |
738 | ▁방음 | 0.504998 |
193 | ▁객실 | 0.540762 |
398 | ▁넓 | 0.561374 |
851 | ▁빼고 | 0.578300 |