In [1]:
from konlpy.tag import Kkma
from nltk.tokenize import word_tokenize
sentence = '오늘 날씨는 어제 날씨보다 안좋은거 같아요'
In [2]:
print(Kkma().pos(sentence))
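The tags in the output (NNG, VA, JX, ...) come from the Kkma tagset. If they are unfamiliar, konlpy exposes a lookup table; a minimal sketch, assuming the tagset property available on recent konlpy taggers:

print(Kkma().tagset)  # dict mapping each tag (e.g. 'NNG') to its Korean description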
In [3]:
for token in word_tokenize(sentence):  # for this sentence, equivalent to sentence.split()
    print(Kkma().nouns(token))
In [4]:
# Keep only NNG (common noun) tokens
for token in [word for word in Kkma().pos(sentence) if word[1].startswith('NNG')]:
    print(token)
In [5]:
for token in [word for word in Kkma().pos(sentence) if word[1] in ['NNG', 'VA'] and len(word[0]) > 1]:
    print(token)
In [6]:
def ngramEojeol(sentence, n=2):
    # Word-level (eojeol) n-grams
    tokens = sentence.split()
    ngram = []
    for i in range(len(tokens) - n + 1):
        ngram.append(' '.join(tokens[i:i + n]))
    return ngram
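Applied to the sample sentence, this yields the word bigrams:

print(ngramEojeol('오늘 날씨는 어제 날씨보다 안좋은거 같아요'))
# ['오늘 날씨는', '날씨는 어제', '어제 날씨보다', '날씨보다 안좋은거', '안좋은거 같아요']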
In [7]:
def ngramUmjeol(term, n=2):
    # Syllable-level (umjeol) n-grams
    ngram = []
    for i in range(len(term) - n + 1):
        ngram.append(''.join(term[i:i + n]))
    return ngram
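Same idea at the syllable level, sliding over characters instead of words:

print(ngramUmjeol('날씨보다'))
# ['날씨', '씨보', '보다']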
In [8]:
# Index term list
sentence = '오늘 날씨는 어제 날씨보다 안좋은거 같아요'
lexicon = list(set(sentence.split()))
print('1. ', lexicon)
# Expand the index term list (considering POS tag and token length)
for token in [word for word in Kkma().pos(sentence) if word[1] in ['NNG', 'VA'] and len(word[0]) > 1]:
    lexicon.append(token[0])
print('2. ', list(set(lexicon)))
# Eojeol tokens, POS/length-filtered morphemes, and n-grams over eojeols
lexicon.extend(ngramEojeol(sentence))
print('3. ', list(set(lexicon)))
filtered = [word[0] for word in Kkma().pos(sentence)
            if word[1] in ['NNG', 'VA'] and len(word[0]) > 1]
lexicon.extend(ngramEojeol(' '.join(filtered)))
print('4. ', list(set(lexicon)))
newLexicon = list()
for word in lexicon:
    if len(word.split()) == 1:  # syllable bigrams only for single-eojeol entries
        newLexicon.extend(ngramUmjeol(word))
lexicon.extend(newLexicon)
print('5. ', list(set(lexicon)))
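The five expansion steps above can be wrapped in one helper. A minimal sketch under the same NNG/VA filter; buildLexicon is a hypothetical name, not used later in the notebook:

def buildLexicon(sentence, tags=('NNG', 'VA')):
    lexicon = list(set(sentence.split()))                      # 1. raw eojeol tokens
    morphemes = [word for word, tag in Kkma().pos(sentence)
                 if tag in tags and len(word) > 1]
    lexicon.extend(morphemes)                                  # 2. POS/length-filtered morphemes
    lexicon.extend(ngramEojeol(sentence))                      # 3. eojeol bigrams
    lexicon.extend(ngramEojeol(' '.join(morphemes)))           # 4. bigrams over filtered morphemes
    lexicon.extend([gram for word in lexicon if len(word.split()) == 1
                    for gram in ngramUmjeol(word)])            # 5. syllable bigrams
    return list(set(lexicon))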
In [9]:
import os
def getFileList(base='./', ext='.txt'):
    fileList = list()
    for file in os.listdir(base):
        if file.endswith(ext):  # note: file.split('.')[-1] would give 'txt', without the dot
            fileList.append('{0}/{1}'.format(base, file))
    return fileList
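The standard glob module gives the same listing in one line; an equivalent sketch (getFileListGlob is a hypothetical name):

import glob
def getFileListGlob(base='./', ext='.txt'):
    # wildcard match on the extension, same './News/file.txt' style paths
    return glob.glob('{0}/*{1}'.format(base, ext))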
In [12]:
len(getFileList('./News'))
Out[12]:
In [13]:
def getContent(file):
    with open(file, encoding='utf-8') as f:
        content = f.read()
    return content
In [14]:
from nltk.tokenize import sent_tokenize
corpus = getContent(getFileList('./News')[0])
sent_tokenize(corpus)
Out[14]:
In [15]:
import re
from string import punctuation
print(punctuation)
In [16]:
# Punctuation repeated two or more times (e.g. a literal '//')
pattern = re.compile(r'[%s]{2,}' % re.escape(punctuation))
pattern.findall(corpus)
Out[16]:
In [17]:
# Two or more consecutive whitespace characters
pattern = re.compile(r'\s{2,}')
pattern.findall(corpus)
Out[17]:
In [18]:
# Runs of 8+ characters made of English letters (either case), '-', or '_'
pattern = re.compile(r'[A-Za-z\-_]{8,}')
pattern.findall(corpus)
Out[18]:
In [19]:
# Email addresses
# \w matches alphanumerics plus underscore
# domain part, e.g. naver + .com / .co / .kr (dot escaped so it matches a literal '.')
pattern = re.compile(r'(\w+@[A-Za-z0-9\-\_]{3,}(\.[A-Za-z]{2,})+)')
pattern.findall(corpus)
Out[19]:
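With the dot escaped, the pattern matches multi-level domains as intended; a quick check on a made-up address:

pattern.findall('문의: hong@naver.co.kr')
# [('hong@naver.co.kr', '.kr')]  -- findall returns one tuple per match, one item per group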
In [20]:
# Remove everything that is not Hangul
pattern = re.compile(r'([^ㄱ-ㅎㅏ-ㅣ가-힣]+)')
# pattern.findall(corpus)
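Used with sub, every non-Hangul run collapses into a single space, for example:

print(pattern.sub(' ', '가격은 $100 입니다!'))
# '가격은 입니다 '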
In [21]:
def getPatternList():
    patternList = {}
    patternList['Korean'] = re.compile(r'([^ㄱ-ㅎㅏ-ㅣ가-힣]+)')
    patternList['Email'] = re.compile(r'(\w+@[A-Za-z0-9\-\_]{3,}(\.[A-Za-z]{2,})+)')
    patternList['Whitespace'] = re.compile(r'\s{2,}')
    patternList['Punctuation'] = re.compile(r'[%s]{2,}' % re.escape(punctuation))
    return patternList
In [22]:
corpus = getContent(getFileList('./News')[0])
corpus = getPatternList()['Korean'].sub(" ", corpus)
corpus = getPatternList()['Punctuation'].sub(" ", corpus)
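Note that the Korean pattern already replaces every non-Hangul run (punctuation included) with a space, so the punctuation pass has little left to remove in this order. Chaining the substitutions in a small helper makes the order explicit; cleanCorpus is a hypothetical name:

def cleanCorpus(corpus, order=('Korean', 'Punctuation')):
    # apply the compiled patterns in the given order, replacing matches with a space
    patterns = getPatternList()
    for name in order:
        corpus = patterns[name].sub(' ', corpus)
    return corpus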
In [23]:
dictTerm = list()
dictPos = list()
dictNoun = list()
dictNgram = list()
for sentence in sent_tokenize(corpus):
    for token in sentence.split():
        if len(token) > 1:
            dictTerm.append(token)
            dictPos.extend([morpheme for morpheme in Kkma().morphs(token) if len(morpheme) > 1])
            dictNoun.extend([noun for noun in Kkma().nouns(token) if len(noun) > 1])
            dictNgram.extend(ngramUmjeol(token))
dictTerm = list(set(dictTerm))
dictPos = list(set(dictPos))
dictNoun = list(set(dictNoun))
dictNgram = list(set(dictNgram))
In [24]:
len(dictTerm), len(dictPos), len(dictNoun), len(dictNgram)
Out[24]:
In [25]:
len(list(set(dictTerm + dictPos + dictNoun + dictNgram)))
Out[25]:
In [26]:
print(dictTerm)
In [27]:
print(dictPos)
In [28]:
print(dictNoun)
In [29]:
print(dictNgram)