728x90
반응형
In [119]:
import requests

# Desktop Chrome user-agent so Naver serves the normal desktop page.
headers = {'user-agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537'}

def getDownload(url, param = None, retries = 3):
    """GET `url` with query parameters `param`, retrying on 5xx responses.

    Retries up to `retries` times (recursively) when the server answers with
    a 5xx status; on a non-retryable HTTP error it logs the status, reason
    and request headers. Returns the `requests.Response` in every case.
    Note: connection errors/timeouts are NOT caught and will propagate.
    """
    resp = None
    try:
        resp = requests.get(url, params = param, headers = headers)
        resp.raise_for_status()
    except requests.exceptions.HTTPError:  # unused `as e` binding removed
        # 5xx is usually transient -> retry; 4xx is the caller's fault -> log.
        if 500 <= resp.status_code < 600 and retries > 0:
            print('Retries : {0}'.format(retries))
            return getDownload(url, param, retries - 1)
        else:
            print(resp.status_code)
            print(resp.reason)
            print(resp.request.headers)
    return resp
In [120]:
from bs4 import BeautifulSoup
In [269]:
url = 'https://movie.naver.com/movie/point/af/list.nhn?'
# NOTE(review): the export dropped the page value here — `{'page':}` is a
# syntax error. Page 1 restores the obvious smoke-test request.
params = {'page': 1}
In [270]:
# Fetch one review-list page and parse the HTML into a BeautifulSoup DOM.
html = getDownload(url,params)
dom = BeautifulSoup(html.text,'html.parser')
In [275]:
import tqdm
In [280]:
starList = list()
titleList = list()
reviewList = list()
movie = list()

# Crawl pages 1..1000 of the Naver movie review board.
# `tqdm.tqdm_notebook` is deprecated (removed in current tqdm); use
# `tqdm.tqdm` instead. The inner per-tag progress bars were dropped — they
# only produced notebook-widget spam for ~10-item lists.
for n in tqdm.tqdm(range(1, 1001)):
    url = 'https://movie.naver.com/movie/point/af/list.nhn?'
    params = {'page': n}
    html = getDownload(url, params)
    dom = BeautifulSoup(html.text, 'html.parser')
    # Collect the numeric ratings.
    for tag in dom.select('.list_netizen .point'):
        starList.append(tag.text)
    # Collect "title\nreview" text blobs; split into columns later.
    for tag in dom.select('table tbody tr .title'):
        movie.append(tag.text.strip())
In [282]:
# Split each "title\nreview" blob into the two column lists.
# The original split the same string twice and raised IndexError whenever a
# row had no review text; split once and fall back to an empty review.
for blob in movie:
    parts = blob.split('\n')
    titleList.append(parts[0])
    reviewList.append(parts[1] if len(parts) > 1 else '')
In [287]:
len(starList),len(titleList),len(reviewList)
Out[287]:
In [335]:
a = '어벤져스 앤드게임이, 마지막이라니, 뭔가 그리울것 같네요! ㅠㅠ '
In [336]:
from string import punctuation

# Demo: strip every ASCII punctuation character from the sample sentence.
_strip_punct = str.maketrans('', '', punctuation)
a.translate(_strip_punct)
Out[336]:
In [484]:
import re

def getCleanText(item):
    """Collapse every run of 2+ whitespace characters in `item` to one space.

    Fixes the original `item.replace('', '', " ")`: str.replace takes
    (old, new[, count]); passing the string " " as `count` raised a
    TypeError on every call.
    """
    return re.sub(r"\s{2,}", " ", item)
In [312]:
# Strip punctuation (incl. commas) so the reviews are safe to write as CSV.
from string import punctuation

# Hoist the translation table out of the loop — the original rebuilt it
# with str.maketrans once per review.
_punct_table = str.maketrans('', '', punctuation)
reviewSub = [t.translate(_punct_table) for t in reviewList]
In [317]:
movieDF = pd.DataFrame({"Star":starList, "Title":titleList, "Review":reviewSub})
In [318]:
pwd
Out[318]:
In [ ]:
# Save the DataFrame to CSV (intentionally commented out so re-running the
# notebook does not overwrite the saved file — TODO confirm)
# movieDF.to_csv('movie_review.csv',sep=',',mode='w')
In [719]:
# Read the saved reviews back. As the original note itself says, passing
# index_col=0 makes pandas use the saved index column directly, so the
# fragile `del data['Unnamed: 0']` workaround is no longer needed.
import pandas as pd

data = pd.read_csv('movie_review.csv', sep=',', index_col=0)
# Drop rows whose review text is missing.
data = data[data['Review'].notnull()]
In [720]:
len(data)
Out[720]:
In [721]:
data.head()
Out[721]:
In [534]:
data.count()
Out[534]:
In [473]:
# 10-star ratings dominate the distribution, so exclude them before the
# sentiment analysis.
data.groupby('Star').count()
Out[473]:
In [622]:
# Label reviews by rating: > 7 is positive, <= 3 is negative; the dominant
# 10-star ratings are dropped first and 4..7 are left out as neutral.
upperbound = 7
lowerbound = 3
data = data[data['Star'] < 10]
positive = data[data['Star'] > upperbound]
negative = data[data['Star'] <= lowerbound]
In [623]:
len(positive),len(negative)
Out[623]:
In [590]:
from collections import defaultdict, Counter
from konlpy.tag import Komoran,Kkma

# Document-term matrices: doc index -> Counter mapping "morpheme/POS" -> count.
positiveDTM = defaultdict(Counter)
negativeDTM = defaultdict(Counter)

ma = Komoran()

# Tokenize every positive review, keeping morphemes longer than one character.
for i, review in enumerate(positive['Review']):
    posList = ma.pos(review)
    for morph, tag in posList:
        if len(morph) > 1:
            positiveDTM[i][f"{morph}/{tag}"] += 1
In [624]:
# Tokenize the negative reviews. The original wrapped the WHOLE loop in a
# bare `try/except`, so the first problematic review silently aborted
# tokenization of everything after it; catch per-review and keep going.
for i, review in enumerate(negative['Review']):
    try:
        posList = ma.pos(review)
    except Exception:
        print('Null.')
        continue
    for term in posList:
        if len(term[0]) > 1:
            negativeDTM[i]["/".join(term)] += 1
In [625]:
len(negativeDTM),len(positiveDTM)
Out[625]:
In [626]:
# Invert the DTMs (doc -> term counts) into TDMs (term -> doc counts).
positiveTDM = defaultdict(Counter)
negativeTDM = defaultdict(Counter)

for doc_id, counts in positiveDTM.items():
    for t, f in counts.items():
        positiveTDM[t][doc_id] = f

for doc_id, counts in negativeDTM.items():
    for t, f in counts.items():
        negativeTDM[t][doc_id] = f
In [627]:
# Vocabulary and document-id lists per class.
# BUG FIX: the original built BOTH `negativeLexicon` and `negativeDocuments`
# from the POSITIVE matrices (copy-paste), which silently skewed every PMI
# score computed below.
positiveLexicon = list(set(positiveTDM.keys()))
negativeLexicon = list(set(negativeTDM.keys()))
positiveDocuments = list(positiveDTM.keys())
negativeDocuments = list(negativeDTM.keys())
In [628]:
# Aggregate term frequencies per POS tag: tag -> morpheme -> total count.
positivePOS = defaultdict(Counter)
negativePOS = defaultdict(Counter)

for term in positiveLexicon:
    morph = term.split("/")[0]       # morpheme text (before the first '/')
    tag = term.rsplit("/", 1)[-1]    # POS tag (after the last '/')
    # Sum counts over every positive document containing the term; the same
    # morpheme can appear under several terms, hence `+=`.
    positivePOS[tag][morph] += sum(positiveTDM[term].values())

for term in negativeLexicon:
    morph = term.split("/")[0]
    tag = term.rsplit("/", 1)[-1]
    negativePOS[tag][morph] += sum(negativeTDM[term].values())
In [629]:
positivePOS.keys()
Out[629]:
In [630]:
# Common nouns (NNG): top 20 in positive reviews.
positivePOS['NNG'].most_common()[:20]
Out[630]:
In [631]:
negativePOS["NNG"].most_common()[:20]
Out[631]:
In [632]:
Komoran().tagset
Out[632]:
In [675]:
from konlpy.tag import Komoran
from matplotlib import font_manager, rc
import matplotlib.pyplot as plt

tagList = Komoran().tagset
# Korean-capable font; without it Hangul labels render as boxes (presumably
# why NanumBarunGothic was chosen — confirm it is installed).
plt.rcParams['font.family'] = 'NanumBarunGothic'
plt.rcParams['axes.unicode_minus'] = False
rc("figure", figsize=(10,5))

# One chart per POS tag, overlaying the positive and negative relative
# frequencies of its top-20 terms.
# BUG FIX: the original nested a second loop over every negative tag inside
# the positive loop (producing len(pos)*len(neg) charts) and normalized the
# negative bars with sum(negativePOS[posName]) while selecting terms by
# negName — a mismatched normalization. Both are corrected here.
for posName in positivePOS.keys():
    pNorm = sum(positivePOS[posName].values())
    pTermList = [pair[0] for pair in positivePOS[posName].most_common()[:20]]
    pFreqList = [pair[1]/pNorm for pair in positivePOS[posName].most_common()[:20]]

    nNorm = sum(negativePOS[posName].values())
    nTermList = [pair[0] for pair in negativePOS[posName].most_common()[:20]]
    nFreqList = [pair[1]/nNorm for pair in negativePOS[posName].most_common()[:20]]

    plt.title("{0}/{1}".format(posName, tagList[posName]))
    plt.bar(pTermList, pFreqList, align="center")
    plt.bar(nTermList, nFreqList, align="center")
    plt.xticks(rotation=90)
    plt.show()
In [ ]:
# PMI는 correlation이고 이를 가지 SO를 통해 값을 얻어서 Pos / Neg 의미 판단
# SO = Pos(PMI(term,y)) - Neg(PMI(term,y))
# +,-는 Strength가 됨
# 다 하고
# pSeed = ["재밌"]
# nSeed = ["지루"] 라고 해놓고 DTM, TDM 으로 하나로 합쳐줌
# positivem negative 신경쓰지 말기
In [664]:
from math import log

K = 0.1                                    # additive smoothing constant
N = len(positiveDTM) + len(negativeDTM)    # total number of documents

# PMI of each term with the positive class:
#   PMI(term, pos) = log( P(term, pos) / (P(pos) * P(term)) )
# NPMI divides by -log P(term, pos), normalizing into [-1, 1].
positivePMI = defaultdict(float)
positiveNPMI = defaultdict(float)

seedDocList = list(positiveDTM.keys())
seedLikelihood = len(seedDocList)/N

# NOTE(review): positive and negative docs share enumerate() indices, so
# termDocList can conflate distinct documents with the same id — verify
# the upstream indexing if exact probabilities matter.
for term in list(set(positiveLexicon + negativeLexicon)):
    termDocList = list(positiveTDM[term].keys()) + list(negativeTDM[term].keys())
    # CONSISTENCY FIX: the negative-PMI cell smooths termLikelihood with +K;
    # this cell originally did not. Apply the same smoothing here.
    termLikelihood = (len(termDocList) + K)/N
    jointLikelihood = (len(positiveTDM[term].keys()) + K)/N
    multiLikelihood = seedLikelihood * termLikelihood
    positivePMI[term] = log(jointLikelihood/multiLikelihood)
    positiveNPMI[term] = positivePMI[term]/-log(jointLikelihood)
In [665]:
# Same PMI/NPMI computation with the negative class as the seed.
negativePMI = defaultdict(float)
negativeNPMI = defaultdict(float)

seedDocList = list(negativeDTM.keys())
seedLikelihood = len(seedDocList) / N

for term in set(positiveLexicon + negativeLexicon):
    termDocList = list(positiveTDM[term]) + list(negativeTDM[term])
    termLikelihood = (len(termDocList) + K) / N
    jointLikelihood = (len(negativeTDM[term]) + K) / N
    multiLikelihood = seedLikelihood * termLikelihood
    negativePMI[term] = log(jointLikelihood / multiLikelihood)
    negativeNPMI[term] = negativePMI[term] / -log(jointLikelihood)
In [666]:
# Semantic orientation per word: SO = PMI(term, pos) - PMI(term, neg).
# The "inverted" tables hold the negated scores (used later to rank the
# negative side for the word clouds).
pmiSO = defaultdict(float)
invertedPmiSO = defaultdict(float)
npmiSO = defaultdict(float)
invertedNpmiSO = defaultdict(float)

for term in set(positiveLexicon + negativeLexicon):
    word = term.split("/")[0]
    pmiDelta = positivePMI[term] - negativePMI[term]
    npmiDelta = positiveNPMI[term] - negativeNPMI[term]
    pmiSO[word] += pmiDelta
    invertedPmiSO[word] -= pmiDelta
    npmiSO[word] += npmiDelta
    invertedNpmiSO[word] -= npmiDelta
In [667]:
# Top 10 most positive (descending) then most negative (ascending) words
# by PMI- and NPMI-based semantic orientation.
for table in (pmiSO, npmiSO):
    print(sorted(table.items(), key=lambda kv: kv[1], reverse=True)[:10])
for table in (pmiSO, npmiSO):
    print(sorted(table.items(), key=lambda kv: kv[1], reverse=False)[:10])
In [700]:
import matplotlib.pyplot as plt
from wordcloud import WordCloud

# Font path for the cloud — presumably chosen for Hangul glyph support
# (verify the file exists on this machine).
fontPath = "/Library/Fonts/AppleGothic.ttf"

def drawCloud(words):
    """Render a word cloud (max 30 words) from a {word: weight} mapping.

    Returns the rendered cloud as a PIL image.
    """
    cloud = WordCloud(font_path=fontPath, max_words=30, background_color="white")
    cloud.generate_from_frequencies(words)
    return cloud.to_image()
In [703]:
# Positive
# Word cloud of the 30 highest-SO (most positive) words.
drawCloud(dict(sorted(pmiSO.items(), key=lambda x:x[1], reverse=True)[:30]))
Out[703]:
In [704]:
# Negative
# Word cloud of the 30 lowest-SO words (highest inverted/negated scores).
drawCloud(dict(sorted(invertedPmiSO.items(), key=lambda x:x[1], reverse=True)[:30]))
Out[704]:
728x90
반응형