'''
Text feature learning - word vectorization with Word2vec, implemented with the Python package gensim
'''
import gensim
import numpy as np
from gensim.models import word2vec, Word2Vec
# Toy corpus: one title per line. It must be defined before the model is
# trained on it (the original order raised NameError).
sentences = """How to Sound Like a Data Scientist
Types of Data
The Five Steps of Data Science
Basic Mathematics
A Gentle Introduction to Probability
Advanced Probability
Basic Statistics
Advanced Statistics
Communicating Data
Machine Learning Essentials
Beyond the Essentials
Case Studies """.split('\n')

# Word2Vec expects an iterable of token lists. Passing the raw strings would
# make it iterate each sentence character by character, so the vocabulary
# would consist of single characters instead of words.
# NOTE(review): `size` is the gensim < 4.0 keyword; gensim >= 4.0 renamed it
# to `vector_size` - confirm against the installed version.
model = gensim.models.Word2Vec(
    [sentence.split() for sentence in sentences],
    min_count=1,
    size=20,
)
# Word embedding
def get_embedding(string):
    """Look up the embedding of a single word in the trained model.

    Args:
        string: The word to look up in ``model.wv``.

    Returns:
        The vector stored in ``model.wv`` for ``string``, or ``None`` when
        the lookup raises ``KeyError`` (the word is not in the vocabulary).
    """
    try:
        return model.wv[string]
    except KeyError:
        # Only a missing key is an expected outcome here; any other error
        # should surface instead of being silently turned into None.
        return None
def _embed_sentence(sentence):
    """Return the mean embedding of the known words in ``sentence``.

    Returns ``None`` when none of the words has an embedding, so the caller
    can decide what to store for such a sentence.
    """
    word_vectors = [get_embedding(word) for word in sentence.split()]
    word_vectors = [vector for vector in word_vectors if vector is not None]
    if not word_vectors:
        return None
    return np.mean(word_vectors, axis=0)


def _top_sentence_indices(reference_word, top_n=3):
    """Return indices of the ``top_n`` sentences scoring highest against a word.

    The score is the dot product between each row of ``vectorized_sentences``
    and the embedding of ``reference_word``; indices are ordered from the
    highest score down. An empty index array is returned when
    ``reference_word`` has no embedding.
    """
    reference_vector = get_embedding(reference_word)
    if reference_vector is None:
        return np.array([], dtype=int)
    scores = np.dot(vectorized_sentences, reference_vector)
    return scores.argsort()[-top_n:][::-1]


# One row per sentence; the width is taken from the model so it always
# matches the embedding dimensionality (the original hard-coded 300).
vectorized_sentences = np.zeros((len(sentences), model.wv.vector_size))
for i, sentence in enumerate(sentences):
    vectorized_sentence = _embed_sentence(sentence)
    # Sentences without any known word keep their all-zero row.
    if vectorized_sentence is not None:
        # Assign row i only; `[i:]` would overwrite every following row.
        vectorized_sentences[i] = vectorized_sentence
print(vectorized_sentences.shape)

# Find the sentences most similar to "math".
# NOTE(review): the word must be in the model vocabulary; otherwise the
# printed list is empty - confirm the reference words against the corpus.
reference_word = 'math'
best_sentence_idx = _top_sentence_indices(reference_word)
print([sentences[b] for b in best_sentence_idx])

# Find the sentences most similar to "AI".
reference_word = 'AI'
best_sentence_idx = _top_sentence_indices(reference_word)
print([sentences[b] for b in best_sentence_idx])
# Author: 夜已.入深