Python

Related-word analysis with word2vec

이부일 2020. 3. 4. 12:46

# Load packages
import pandas as pd
import numpy as np
import glob
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import RegexpTokenizer
from gensim.models.word2vec import Word2Vec

# Read the data and collect it into a list (one review per element)
pos_review = glob.glob("d:/deeplearning/textmining/pos/*.txt")[0:100]
pos_lines = []
for path in pos_review:
    try:
        with open(path, "r") as f:      # "with" closes the file even if an error occurs
            pos_lines.append(f.readlines()[0])
    except Exception:
        continue                        # skip files that cannot be read or are empty

len(pos_lines)   # number of reviews actually loaded (at most 100)
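
As a quick sanity check (not part of the book's listing), printing the start of the first review confirms the files were read as expected:

print(pos_lines[0][:200])   # first 200 characters of the first loaded review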

# Extract words
stop_words = stopwords.words("english")
tokenizer = RegexpTokenizer(r"[\w]+")    # raw string avoids an invalid-escape warning

text = []
for line in pos_lines:
    tokens = tokenizer.tokenize(line.lower())
    # set() keeps each token only once per review; stop words and the leftover "br" HTML tag are dropped
    stopped_tokens = [t for t in set(tokens) if t not in stop_words + ["br"]]
    stopped_tokens2 = [t for t in stopped_tokens if len(t) > 1]   # drop one-character tokens
    text.append(stopped_tokens2)
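
Note that set() above follows the book's listing: it deduplicates the tokens in each review and also scrambles their order. Since word2vec learns from context windows, a variant that keeps the original token order may be worth trying; the sketch below shows one, where text_ordered is a hypothetical name and not part of the original code.

# Variant (not in the book): keep token order so word2vec's context window
# reflects how the words actually appear in each review
text_ordered = []
for line in pos_lines:
    tokens = tokenizer.tokenize(line.lower())
    kept = [t for t in tokens if t not in stop_words + ["br"] and len(t) > 1]
    text_ordered.append(kept)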

# Related-word analysis based on word2vec
model = Word2Vec(text, sg=1, window=2, min_count=3)   # sg=1: skip-gram; ignore words seen fewer than 3 times
model.init_sims(replace=True)                         # L2-normalize vectors in place; deprecated in gensim 4.x
model.wv.similarity("film", "movie")                  # cosine similarity between "film" and "movie"
model.wv.most_similar("good", topn=10)                # 10 words closest to "good"
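
When this is run as a script rather than in an interactive session, the last two calls return values that are never displayed. The small usage sketch below prints the results and saves the trained model for reuse; the file name word2vec_pos.model is an arbitrary example, not from the book.

# Print the query results: most_similar returns (word, cosine similarity) pairs
print(model.wv.similarity("film", "movie"))
for word, score in model.wv.most_similar("good", topn=10):
    print(word, round(score, 3))

# Save the trained model and load it back later (the path is just an example)
model.save("word2vec_pos.model")
reloaded_model = Word2Vec.load("word2vec_pos.model")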

[Source] 잡아라! 텍스트마이닝 with 파이썬, by 서대호, BJ, pp. 124-126