test code

sudori
Commit 89fc8479e910d4e7d975fbf47ecddad23ce3e5f4 89fc8479 1 parent 80efabcf
Showing 1 changed file with 192 additions and 0 deletions
code/sentimentAnalysis.py
--- a/code/sentimentAnalysis.py 0 → 100644
View file @89fc847
+++ b/code/sentimentAnalysis.py 0 → 100644
View file @89fc847
+import pandas as pd
+import numpy as np
+%matplotlib inline
+import matplotlib.pyplot as plt
+import re
+import urllib.request
+from konlpy.tag import Okt
+from tensorflow.keras.preprocessing.text import Tokenizer
+from tensorflow.keras.preprocessing.sequence import pad_sequences
+from tensorflow.keras.layers import Embedding, Dense, LSTM
+from tensorflow.keras.models import Sequential
+from tensorflow.keras.models import load_model
+from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
+
+
+urllib.request.urlretrieve("https://raw.githubusercontent.com/e9t/nsmc/master/ratings_train.txt", filename="ratings_train.txt")
+urllib.request.urlretrieve("https://raw.githubusercontent.com/e9t/nsmc/master/ratings_test.txt", filename="ratings_test.txt")
+
+train_data = pd.read_table('ratings_train.txt')
+test_data = pd.read_table('ratings_test.txt')
+
+# 리뷰 내용이 잘리지 않도록 전체 샘플 중 길이가 max_len 이하인 샘플의 비율이 몇 %인지 확인하는 함수를 만든다.
+def below_threshold_len(max_len, nested_list):
+  cnt = 0
+  for s in nested_list:
+    if(len(s) <= max_len):
+        cnt = cnt + 1
+  print('전체 샘플 중 길이가 %s 이하인 샘플의 비율: %s'%(max_len, (cnt / len(nested_list))*100))
+
+# 예측 함수
+def sentiment_predict(new_sentence):
+  new_sentence = okt.morphs(new_sentence, stem=True) # 토큰화
+  new_sentence = [word for word in new_sentence if not word in stopwords] # 불용어 제거
+  encoded = tokenizer.texts_to_sequences([new_sentence]) # 정수 인코딩
+  pad_new = pad_sequences(encoded, maxlen = max_len) # 패딩
+  score = float(loaded_model.predict(pad_new)) # 예측
+  if(score > 0.5):
+    print("{:.2f}% 확률로 긍정 리뷰입니다.\n".format(score * 100))
+  else:
+    print("{:.2f}% 확률로 부정 리뷰입니다.\n".format((1 - score) * 100))
+
+print('훈련용 리뷰 개수 :',len(train_data)) # 훈련용 리뷰 개수 출력
+
+train_data[:5] # 상위 5개 출력
+print('테스트용 리뷰 개수 :',len(test_data)) # 테스트용 리뷰 개수 출력
+
+test_data[:5]
+train_data['document'].nunique(), train_data['label'].nunique()
+
+train_data.drop_duplicates(subset=['document'], inplace=True) # document 열에서 중복인 내용이 있다면 중복 제거
+
+print('총 샘플의 수 :',len(train_data))
+
+train_data['label'].value_counts().plot(kind = 'bar')
+
+print(train_data.groupby('label').size().reset_index(name = 'count'))
+
+print(train_data.isnull().values.any())
+
+print(train_data.isnull().sum())
+
+train_data.loc[train_data.document.isnull()]
+
+train_data = train_data.dropna(how = 'any') # Null 값이 존재하는 행 제거
+print(train_data.isnull().values.any()) # Null 값이 존재하는지 확인
+
+print(len(train_data))
+
+text = 'do!!! you expect... people~ to~ read~ the FAQ, etc. and actually accept hard~! atheism?@@'
+re.sub(r'[^a-zA-Z ]', '', text) #알파벳과 공백을 제외하고 모두 제거
+
+train_data['document'] = train_data['document'].str.replace("[^ㄱ-ㅎㅏ-ㅣ가-힣 ]","")
+# 한글과 공백을 제외하고 모두 제거
+train_data[:5]
+
+train_data['document'].replace('', np.nan, inplace=True)
+print(train_data.isnull().sum())
+
+train_data.loc[train_data.document.isnull()][:5]
+
+train_data = train_data.dropna(how = 'any')
+print(len(train_data))
+
+test_data.drop_duplicates(subset = ['document'], inplace=True) # document 열에서 중복인 내용이 있다면 중복 제거
+test_data['document'] = test_data['document'].str.replace("[^ㄱ-ㅎㅏ-ㅣ가-힣 ]","") # 정규 표현식 수행
+test_data['document'].replace('', np.nan, inplace=True) # 공백은 Null 값으로 변경
+test_data = test_data.dropna(how='any') # Null 값 제거
+print('전처리 후 테스트용 샘플의 개수 :',len(test_data))
+
+stopwords = ['의','가','이','은','들','는','좀','잘','걍','과','도','를','으로','자','에','와','한','하다']
+
+okt = Okt()
+okt.morphs('와 이런 것도 영화라고 차라리 뮤직비디오를 만드는 게 나을 뻔', stem = True)
+
+X_train = []
+for sentence in train_data['document']:
+    temp_X = []
+    temp_X = okt.morphs(sentence, stem=True) # 토큰화
+    temp_X = [word for word in temp_X if not word in stopwords] # 불용어 제거
+    X_train.append(temp_X)
+
+print(X_train[:3])
+
+X_test = []
+for sentence in test_data['document']:
+    temp_X = []
+    temp_X = okt.morphs(sentence, stem=True) # 토큰화
+    temp_X = [word for word in temp_X if not word in stopwords] # 불용어 제거
+    X_test.append(temp_X)
+
+# 정수 인코딩
+tokenizer = Tokenizer()
+tokenizer.fit_on_texts(X_train)
+print(tokenizer.word_index)
+
+threshold = 3
+total_cnt = len(tokenizer.word_index) # 단어의 수
+rare_cnt = 0 # 등장 빈도수가 threshold보다 작은 단어의 개수를 카운트
+total_freq = 0 # 훈련 데이터의 전체 단어 빈도수 총 합
+rare_freq = 0 # 등장 빈도수가 threshold보다 작은 단어의 등장 빈도수의 총 합
+
+# 단어와 빈도수의 쌍(pair)을 key와 value로 받는다.
+for key, value in tokenizer.word_counts.items():
+    total_freq = total_freq + value
+
+    # 단어의 등장 빈도수가 threshold보다 작으면
+    if(value < threshold):
+        rare_cnt = rare_cnt + 1
+        rare_freq = rare_freq + value
+
+print('단어 집합(vocabulary)의 크기 :',total_cnt)
+print('등장 빈도가 %s번 이하인 희귀 단어의 수: %s'%(threshold - 1, rare_cnt))
+print("단어 집합에서 희귀 단어의 비율:", (rare_cnt / total_cnt)*100)
+print("전체 등장 빈도에서 희귀 단어 등장 빈도 비율:", (rare_freq / total_freq)*100)
+
+# 전체 단어 개수 중 빈도수 2이하인 단어 개수는 제거.
+# 0번 패딩 토큰과 1번 OOV 토큰을 고려하여 +2
+vocab_size = total_cnt - rare_cnt + 2
+print('단어 집합의 크기 :',vocab_size)
+
+tokenizer = Tokenizer(vocab_size, oov_token = 'OOV') 
+tokenizer.fit_on_texts(X_train)
+X_train = tokenizer.texts_to_sequences(X_train)
+X_test = tokenizer.texts_to_sequences(X_test)
+print(X_train[:3])
+
+y_train = np.array(train_data['label'])
+y_test = np.array(test_data['label'])
+
+drop_train = [index for index, sentence in enumerate(X_train) if len(sentence) < 1]
+
+# 빈 샘플들을 제거
+X_train = np.delete(X_train, drop_train, axis=0)
+y_train = np.delete(y_train, drop_train, axis=0)
+print(len(X_train))
+print(len(y_train))
+
+# 패딩
+print('리뷰의 최대 길이 :',max(len(l) for l in X_train))
+print('리뷰의 평균 길이 :',sum(map(len, X_train))/len(X_train))
+plt.hist([len(s) for s in X_train], bins=50)
+plt.xlabel('length of samples')
+plt.ylabel('number of samples')
+plt.show()
+
+max_len = 30
+below_threshold_len(max_len, X_train)
+X_train = pad_sequences(X_train, maxlen = max_len)
+X_test = pad_sequences(X_test, maxlen = max_len)
+
+# LSTM 영화 리뷰 감성 분류 모델 제작
+model = Sequential()
+model.add(Embedding(vocab_size, 100))
+model.add(LSTM(128))
+model.add(Dense(1, activation='sigmoid'))
+
+es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=4)
+mc = ModelCheckpoint('best_model.h5', monitor='val_acc', mode='max', verbose=1, save_best_only=True)
+model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['acc'])
+history = model.fit(X_train, y_train, epochs=15, callbacks=[es, mc], batch_size=60, validation_split=0.2)
+
+# best model load
+loaded_model = load_model('best_model.h5')
+print("\n 테스트 정확도: %.4f" % (loaded_model.evaluate(X_test, y_test)[1]))
+
+sentiment_predict('이 영화 넘재밌어')
+sentiment_predict('감독 뭐하는 놈이냐?')
+sentiment_predict('와 개쩐다 정말 세계관 최강자들의 영화다')
+sentiment_predict('눈물이 주륵주륵 흐르네 그냥 ㅋㅋ')
+sentiment_predict('진짜 개재밌다 ㅋㅋㅋㅋㅋ')
+sentiment_predict('절대 보지마라 개 쓰레기영화 ㅋㅋ')
+sentiment_predict('아유 개 씹노잼 영화 돈버린다')
\ No newline at end of file