Synonyms: Word Vector Similarity
The following measures are commonly used when computing word vector similarity (a short sketch comparing the distance measures follows the list):
- Cosine similarity
- Manhattan distance (L1 norm)
- Euclidean distance (L2 norm)
- Minkowski distance (a generalization of the previous two), which in the limiting case becomes the Chebyshev distance
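As a quick illustration of the distance measures above (the cosine formula is developed below), here is a minimal sketch assuming NumPy and SciPy are available; the example vectors are arbitrary:

```python
import numpy as np
from scipy.spatial import distance

a = np.array([1.0, 2.0, 3.0])
b = np.array([2.0, 0.0, 4.0])

print(distance.cityblock(a, b))       # Manhattan distance (L1 norm of a - b)
print(distance.euclidean(a, b))       # Euclidean distance (L2 norm of a - b)
print(distance.minkowski(a, b, p=3))  # Minkowski distance with p = 3
print(distance.chebyshev(a, b))       # Chebyshev distance (limit as p -> infinity)
```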
The most commonly used is cosine similarity, which is the cosine of the angle between the two vectors.
The cosine of the angle between two vectors follows from the Euclidean dot product formula:
\mathbf{a} \cdot \mathbf{b} = \|\mathbf{a}\| \, \|\mathbf{b}\| \cos \theta
The cosine similarity, cos(θ), is therefore computed as:
\text{similarity} = \cos(\theta) = \frac{A \cdot B}{\|A\| \, \|B\|} = \frac{\sum_{i=1}^{n} A_{i} B_{i}}{\sqrt{\sum_{i=1}^{n} A_{i}^{2}} \, \sqrt{\sum_{i=1}^{n} B_{i}^{2}}}
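The formula maps directly onto NumPy. A minimal sketch (the vectors are arbitrary examples and the helper name is illustrative):

```python
import numpy as np

def cosine_similarity(a, b):
    # similarity = (A . B) / (||A|| * ||B||)
    return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))

a = np.array([1.0, 2.0, 3.0])
b = np.array([2.0, 0.0, 4.0])
print(cosine_similarity(a, b))
```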
Therefore, to find a word's synonyms (nearest neighbours) from word vectors in Python: compute the cosine similarity between every other word's vector and the query word's vector, then take the largest one (with np.argmax()). The snippet below applies the same idea to the word-analogy task a : b :: c : ?. It normalizes the embedding matrix, scores the candidate vector b - a + c against every word with a dot product (which equals cosine similarity once the rows are normalized), and takes the argmax; a simpler single-word lookup follows it.
import numpy as np

# vocab_size, vector_dim, vectors (word -> vector dict), vocab (word -> index dict),
# data (analogy questions as rows of four words) and split_size are assumed to be
# defined earlier in the script.
W = np.zeros((vocab_size, vector_dim))
for word, v in vectors.items():
    if word == '<unk>':
        continue
    W[vocab[word], :] = v

# Normalize each row to unit length so that a dot product equals cosine similarity
d = np.sqrt(np.sum(W ** 2, axis=1))
W_norm = (W.T / d).T
W = W_norm

indices = np.array([[vocab[word] for word in row] for row in data])
ind1, ind2, ind3, ind4 = indices.T  # ind4 holds the expected answer (not used in this excerpt)

predictions = np.zeros((len(indices),))
num_iter = int(np.ceil(len(indices) / float(split_size)))
for j in range(num_iter):
    subset = np.arange(j * split_size, min((j + 1) * split_size, len(ind1)))

    # predicted vector for the analogy a : b :: c : ?  ->  b - a + c
    pred_vec = (W[ind2[subset], :] - W[ind1[subset], :] + W[ind3[subset], :])
    # cosine similarity, since the rows of W have been normalized
    dist = np.dot(W, pred_vec.T)

    # exclude the three question words themselves from the candidates
    for k in range(len(subset)):
        dist[ind1[subset[k]], k] = -np.inf
        dist[ind2[subset[k]], k] = -np.inf
        dist[ind3[subset[k]], k] = -np.inf

    # index of the predicted word = argmax of cosine similarity
    predictions[subset] = np.argmax(dist, 0).flatten()
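Built on the same normalized matrix W and vocab dictionary as the snippet above, a synonym lookup for a single word reduces to one matrix-vector product and a top-n selection. This is a minimal sketch: the function name most_similar_word and the ivocab list (index -> word) are illustrative assumptions, not part of the snippet above.

```python
import numpy as np

def most_similar_word(query, W, vocab, ivocab, topn=10):
    # Rows of W are unit length, so a dot product is a cosine similarity.
    sims = np.dot(W, W[vocab[query]])
    sims[vocab[query]] = -np.inf       # exclude the query word itself
    best = np.argsort(-sims)[:topn]    # argsort rather than argmax to keep the top n
    return [(ivocab[i], float(sims[i])) for i in best]
```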
Word2Vec
''' Train word vectors '''
from gensim.models import word2vec

with open('corpus_space.txt', 'r') as f:
    sentences = f.readlines()
sentences = [s.split() for s in sentences]

# size = vector dimensionality, iter = number of training epochs
# (in gensim >= 4.0 these parameters are named vector_size and epochs)
model = word2vec.Word2Vec(sentences, window=5, min_count=2, size=300, iter=30, workers=3)

# model = word2vec.Word2Vec.load("word2vec.model")
# model.save('vectors_300d_word2vec')     # save the full model (training can be resumed)
# model.wv.save('vectors_300d_word2vec')  # save only the KeyedVectors
model.wv.save_word2vec_format('vectors_300d_word2vec.txt')  # keep only the word vectors, in text format

''' Nearest neighbours (synonyms) '''
from gensim.models.keyedvectors import KeyedVectors

'''
sgns.wiki.word is a plain-text file mapping words to their 300d vectors.
Download: https://pan.baidu.com/s/1AmXYWVgkxrG4GokevPtNgA
'''
word_vectors = KeyedVectors.load_word2vec_format('sgns.wiki.word', binary=False)  # text format
# word_vectors.save_word2vec_format('sgns.wiki.word', binary=False)  # save back in C text format

print(word_vectors.most_similar('微信', topn=10))
'''
[('朋友圈', 0.7493679523468018), ('12306', 0.7262053489685059), ('网盘', 0.7253696918487549), ('央视网', 0.7234827876091003), ('道板', 0.7190656661987305), ('跟帖', 0.7162557244300842), ('账号', 0.7146360874176025), ('人人网', 0.7126163244247437), ('陌陌', 0.7096223831176758), ('微博客', 0.7089349627494812)]
'''

print(word_vectors.most_similar('无中生有', topn=10))
'''
[('断章取义', 0.8651273250579834), ('无稽', 0.843364953994751), ('扣帽子', 0.8430458307266235), ('沽名钓誉', 0.8417525291442871), ('怪力乱神', 0.8404598236083984), ('造谣生事', 0.8398465514183044), ('无的放矢', 0.8393775224685669), ('是非曲直', 0.838774561882019), ('以偏概全', 0.8376606106758118), ('上纲上线', 0.8358146548271179)]
'''

print(word_vectors.most_similar('国庆', topn=10))
'''
[('双十国庆', 0.6437270641326904), ('国庆节', 0.6245113611221313), ('庆祝国庆', 0.6125393509864807), ('阅兵典礼', 0.6078155636787415), ('观礼团', 0.6043969988822937), ('光复节', 0.5971723794937134), ('五一劳动节', 0.5968620777130127), ('阅兵式', 0.591229259967804), ('国际劳动节', 0.5903476476669312), ('庆祝大会', 0.5899035930633545)]
'''

print(word_vectors.most_similar(positive=['国王', '女'], negative=['男'], topn=10))  # 国王 + 女 - 男 ≈ 王后 (king + woman - man ≈ queen)
'''
[('王后', 0.5802208185195923), ('世都', 0.51966392993927), ('王储', 0.5129014253616333), ('世和', 0.5113554000854492), ('世之女', 0.5077805519104004), ('世及', 0.5068414211273193), ('世之弟', 0.5057287812232971), ('普密蓬·阿杜德', 0.5053600668907166), ('世也', 0.5015645027160645), ('摄政王', 0.4989777207374573)]
'''
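To check the vectors produced by the training snippet above, they can be reloaded from the saved text file and queried the same way. A minimal sketch, assuming vectors_300d_word2vec.txt exists and that the query words ('微信' and '微博' are only examples) occur in the training corpus:

```python
from gensim.models.keyedvectors import KeyedVectors

# Reload the vectors saved by the training snippet above (word2vec text format).
wv = KeyedVectors.load_word2vec_format('vectors_300d_word2vec.txt', binary=False)

# Nearest neighbours by cosine similarity; the query word must be in the vocabulary.
print(wv.most_similar('微信', topn=5))

# Cosine similarity between two specific words.
print(wv.similarity('微信', '微博'))
```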
References
- Gensim (word2vec) in Python, a detailed walkthrough: https://blog.csdn.net/gdh756462786/article/details/79108665/
- https://segmentfault.com/a/1190000008173404
- Word2Vec documentation: https://radimrehurek.com/gensim/models/word2vec.html
- 300d Chinese word vectors (downloads): https://github.com/Embedding/Chinese-Word-Vectors
- Pretrained word vectors: Chinese Wikipedia and Stanford GloVe English downloads: https://blog.csdn.net/sinat_41144773/article/details/89875130
- GloVe experiments on Chinese Wikipedia: https://blog.csdn.net/kisslotus/article/details/78298436
- Chinese word vectors trained on Wikipedia: https://www.cnblogs.com/Darwin2000/p/5786984.html https://pan.baidu.com/s/1UpZeuqlNMl6XtTB5la53Xw