test code

sudori
Commit 89fc8479e910d4e7d975fbf47ecddad23ce3e5f4 89fc8479 1 parent 80efabcf
Showing 1 changed file with 192 additions and 0 deletions
code/sentimentAnalysis.py
--- a/code/sentimentAnalysis.py 0 → 100644
View file @89fc847
+++ b/code/sentimentAnalysis.py 0 → 100644
View file @89fc847
+ import pandas as pd
+ import numpy as np
+ %matplotlib inline
+ import matplotlib.pyplot as plt
+ import re
+ import urllib.request
+ from konlpy.tag import Okt
+ from tensorflow.keras.preprocessing.text import Tokenizer
+ from tensorflow.keras.preprocessing.sequence import pad_sequences
+ from tensorflow.keras.layers import Embedding, Dense, LSTM
+ from tensorflow.keras.models import Sequential
+ from tensorflow.keras.models import load_model
+ from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
+ 
+ 
# Fetch the train/test rating files from the e9t/nsmc GitHub repository into
# the current working directory, then load each one into a DataFrame.
for url, local_name in (
    ("https://raw.githubusercontent.com/e9t/nsmc/master/ratings_train.txt", "ratings_train.txt"),
    ("https://raw.githubusercontent.com/e9t/nsmc/master/ratings_test.txt", "ratings_test.txt"),
):
    urllib.request.urlretrieve(url, filename=local_name)

train_data = pd.read_table('ratings_train.txt')
test_data = pd.read_table('ratings_test.txt')
+ 
# Helper for choosing a padding length: reports what share of the samples
# would fit into max_len without being truncated.
def below_threshold_len(max_len, nested_list):
    """Print the percentage of samples whose length is at most ``max_len``.

    Args:
        max_len: Inclusive length threshold.
        nested_list: Sized collection of sized samples (e.g. lists of
            token ids). Only ``len()`` and iteration are used on it.

    Returns:
        None. The result is written to stdout.
    """
    total = len(nested_list)
    cnt = sum(1 for s in nested_list if len(s) <= max_len)
    # With no samples at all there is nothing below the threshold; report 0
    # instead of raising ZeroDivisionError.
    ratio = (cnt / total) * 100 if total else 0.0
    print('전체 샘플 중 길이가 %s 이하인 샘플의 비율: %s'%(max_len, ratio))
+ 
# Prediction helper
def sentiment_predict(new_sentence):
    """Print the predicted sentiment (positive/negative) of one review.

    Relies on module-level objects that the script creates before calling
    this function: ``okt``, ``stopwords``, ``tokenizer``, ``max_len`` and
    ``loaded_model``.

    Args:
        new_sentence: Raw review text.

    Returns:
        None. The verdict and its confidence are written to stdout.
    """
    # Apply the same Hangul-and-space filter the script applies to the
    # training reviews, so punctuation, digits and Latin text are not fed to
    # the tokenizer as unseen tokens.
    new_sentence = re.sub(r'[^ㄱ-ㅎㅏ-ㅣ가-힣 ]', '', new_sentence)
    tokens = okt.morphs(new_sentence, stem=True)  # morphological tokenization
    tokens = [word for word in tokens if word not in stopwords]  # drop stopwords
    encoded = tokenizer.texts_to_sequences([tokens])  # integer encoding
    pad_new = pad_sequences(encoded, maxlen=max_len)  # pad to the training length
    # predict() returns an array for the one-sample batch (presumably shape
    # (1, 1) given the single-unit output layer). Index the scalar explicitly:
    # float() on an array with ndim > 0 is deprecated in recent NumPy.
    score = float(loaded_model.predict(pad_new)[0][0])
    if score > 0.5:
        print("{:.2f}% 확률로 긍정 리뷰입니다.\n".format(score * 100))
    else:
        print("{:.2f}% 확률로 부정 리뷰입니다.\n".format((1 - score) * 100))
+ 
# ---------------------------------------------------------------------------
# Data exploration and cleaning.
# Bare expressions below (e.g. ``train_data[:5]``) are notebook leftovers:
# they only render output when they are the last expression of a notebook cell.
# ---------------------------------------------------------------------------
print('훈련용 리뷰 개수 :', len(train_data))  # number of training reviews

train_data[:5]  # first five training rows
print('테스트용 리뷰 개수 :', len(test_data))  # number of test reviews

test_data[:5]
train_data['document'].nunique(), train_data['label'].nunique()

# Drop rows whose review text duplicates an earlier row.
train_data.drop_duplicates(subset=['document'], inplace=True)

print('총 샘플의 수 :', len(train_data))

# Label distribution, as a bar chart and as a table.
train_data['label'].value_counts().plot(kind='bar')

print(train_data.groupby('label').size().reset_index(name='count'))

# Check for missing values and show the affected rows.
print(train_data.isnull().values.any())

print(train_data.isnull().sum())

train_data.loc[train_data.document.isnull()]

train_data = train_data.dropna(how='any')  # remove rows containing nulls
print(train_data.isnull().values.any())  # confirm no nulls remain

print(len(train_data))

# Regex demo on an English sentence: strip everything except letters and spaces.
text = 'do!!! you expect... people~ to~ read~ the FAQ, etc. and actually accept hard~! atheism?@@'
re.sub(r'[^a-zA-Z ]', '', text)

# Keep only Hangul characters and spaces.
# regex=True must be explicit: since pandas 2.0 Series.str.replace treats the
# pattern as a literal string by default, which would remove nothing here.
train_data['document'] = train_data['document'].str.replace("[^ㄱ-ㅎㅏ-ㅣ가-힣 ]", "", regex=True)
train_data[:5]

# Reviews that became empty are turned into NaN so dropna() can remove them.
# Assign the result back instead of calling replace(inplace=True) on the
# column selection, which is not guaranteed to update the frame itself.
train_data['document'] = train_data['document'].replace('', np.nan)
print(train_data.isnull().sum())

train_data.loc[train_data.document.isnull()][:5]

train_data = train_data.dropna(how='any')
print(len(train_data))

# Same cleaning steps for the test set.
test_data.drop_duplicates(subset=['document'], inplace=True)  # drop duplicate reviews
test_data['document'] = test_data['document'].str.replace("[^ㄱ-ㅎㅏ-ㅣ가-힣 ]", "", regex=True)  # Hangul/space filter
test_data['document'] = test_data['document'].replace('', np.nan)  # empty string -> NaN
test_data = test_data.dropna(how='any')  # remove rows containing nulls
print('전처리 후 테스트용 샘플의 개수 :', len(test_data))
+ 
# Tokens removed after morphological analysis.
stopwords = ['의','가','이','은','들','는','좀','잘','걍','과','도','를','으로','자','에','와','한','하다']

okt = Okt()
# Sample call showing the stemmed morpheme output (result is discarded).
okt.morphs('와 이런 것도 영화라고 차라리 뮤직비디오를 만드는 게 나을 뻔', stem = True)

# Tokenize every training review and drop stopwords.
X_train = [
    [morph for morph in okt.morphs(review, stem=True) if morph not in stopwords]
    for review in train_data['document']
]

print(X_train[:3])

# Same tokenization for the test reviews.
X_test = [
    [morph for morph in okt.morphs(review, stem=True) if morph not in stopwords]
    for review in test_data['document']
]
+ 
+ # 정수 인코딩
+ tokenizer = Tokenizer()
+ tokenizer.fit_on_texts(X_train)
+ print(tokenizer.word_index)
+ 
+ threshold = 3
+ total_cnt = len(tokenizer.word_index) # 단어의 수
+ rare_cnt = 0 # 등장 빈도수가 threshold보다 작은 단어의 개수를 카운트
+ total_freq = 0 # 훈련 데이터의 전체 단어 빈도수 총 합
+ rare_freq = 0 # 등장 빈도수가 threshold보다 작은 단어의 등장 빈도수의 총 합
+ 
+ # 단어와 빈도수의 쌍(pair)을 key와 value로 받는다.
+ for key, value in tokenizer.word_counts.items():
+     total_freq = total_freq + value
+ 
+     # 단어의 등장 빈도수가 threshold보다 작으면
+     if(value < threshold):
+         rare_cnt = rare_cnt + 1
+         rare_freq = rare_freq + value
+ 
+ print('단어 집합(vocabulary)의 크기 :',total_cnt)
+ print('등장 빈도가 %s번 이하인 희귀 단어의 수: %s'%(threshold - 1, rare_cnt))
+ print("단어 집합에서 희귀 단어의 비율:", (rare_cnt / total_cnt)*100)
+ print("전체 등장 빈도에서 희귀 단어 등장 빈도 비율:", (rare_freq / total_freq)*100)
+ 
+ # 전체 단어 개수 중 빈도수 2이하인 단어 개수는 제거.
+ # 0번 패딩 토큰과 1번 OOV 토큰을 고려하여 +2
+ vocab_size = total_cnt - rare_cnt + 2
+ print('단어 집합의 크기 :',vocab_size)
+ 
+ tokenizer = Tokenizer(vocab_size, oov_token = 'OOV') 
+ tokenizer.fit_on_texts(X_train)
+ X_train = tokenizer.texts_to_sequences(X_train)
+ X_test = tokenizer.texts_to_sequences(X_test)
+ print(X_train[:3])
+ 
y_train = np.array(train_data['label'])
y_test = np.array(test_data['label'])

# Positions of training samples that ended up with no tokens after encoding.
drop_train = [index for index, sentence in enumerate(X_train) if len(sentence) < 1]

# Remove the empty samples.
# X_train is a ragged list of lists, so it is filtered in plain Python:
# np.delete would first convert it to an array, which raises ValueError for
# ragged input on NumPy >= 1.24. y_train is a regular 1-D array, so np.delete
# is fine there and keeps labels aligned with the remaining samples.
X_train = [sentence for sentence in X_train if len(sentence) > 0]
y_train = np.delete(y_train, drop_train, axis=0)
print(len(X_train))
print(len(y_train))

# Padding: inspect the length distribution before choosing max_len.
print('리뷰의 최대 길이 :',max(len(l) for l in X_train))
print('리뷰의 평균 길이 :',sum(map(len, X_train))/len(X_train))
plt.hist([len(s) for s in X_train], bins=50)
plt.xlabel('length of samples')
plt.ylabel('number of samples')
plt.show()

max_len = 30
below_threshold_len(max_len, X_train)
X_train = pad_sequences(X_train, maxlen = max_len)
X_test = pad_sequences(X_test, maxlen = max_len)
+ 
# Build the LSTM movie-review sentiment classifier:
# embedding -> LSTM -> single sigmoid output unit.
model = Sequential([
    Embedding(vocab_size, 100),
    LSTM(128),
    Dense(1, activation='sigmoid'),
])
model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['acc'])

# Stop when validation loss has not improved for 4 epochs, and keep the
# checkpoint with the highest validation accuracy on disk.
es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=4)
mc = ModelCheckpoint('best_model.h5', monitor='val_acc', mode='max', verbose=1, save_best_only=True)

history = model.fit(
    X_train,
    y_train,
    epochs=15,
    callbacks=[es, mc],
    batch_size=60,
    validation_split=0.2,
)

# Reload the best checkpoint and report its accuracy on the test set.
loaded_model = load_model('best_model.h5')
test_metrics = loaded_model.evaluate(X_test, y_test)
print("\n 테스트 정확도: %.4f" % (test_metrics[1]))
+ 
# Try the trained model on a handful of hand-written reviews.
for review in (
    '이 영화 넘재밌어',
    '감독 뭐하는 놈이냐?',
    '와 개쩐다 정말 세계관 최강자들의 영화다',
    '눈물이 주륵주륵 흐르네 그냥 ㅋㅋ',
    '진짜 개재밌다 ㅋㅋㅋㅋㅋ',
    '절대 보지마라 개 쓰레기영화 ㅋㅋ',
    '아유 개 씹노잼 영화 돈버린다',
):
    sentiment_predict(review)
\ No newline at end of file