| 12345678910111213141516171819202122232425262728293031323334353637383940414243 |
- # -*- coding:utf-8 -*-
- import config
- import re
- import numpy as np
def merge_stem(a_stem: list, b_stem: list):
    """Merge two stem lists into a single list of unique stems.

    The result keeps first-seen order: every distinct stem of ``a_stem`` in
    its original order, followed by the stems of ``b_stem`` that were not
    already present. Unlike a plain ``set`` union, this ordering is
    reproducible across interpreter runs.

    Args:
        a_stem: Stems of the first word (elements must be hashable).
        b_stem: Stems of the second word (elements must be hashable).

    Returns:
        A new list containing each distinct stem exactly once.
    """
    # dict keys are unique and remember insertion order, which gives an
    # order-preserving de-duplication without any extra bookkeeping.
    return list(dict.fromkeys(list(a_stem) + list(b_stem)))
def gen_word_vec(a_word: str, b_word: str, stem: list):
    """Build stem-frequency vectors for two words.

    For every entry of ``stem`` the number of non-overlapping occurrences of
    that stem inside ``a_word`` and inside ``b_word`` is counted. Stems are
    matched as literal text, so stems containing regular-expression
    metacharacters (``c++``, ``node.js``, ``(``, ...) are counted correctly
    without any manual escaping.

    Args:
        a_word: First text to scan.
        b_word: Second text to scan.
        stem: Stems that define the vector dimensions, in order.

    Returns:
        A tuple ``(a_vec, b_vec)`` of two lists of ints, each the same length
        as ``stem``; position ``i`` holds the count of ``stem[i]``.
    """
    # str.count performs the same non-overlapping, left-to-right count that
    # re.findall does for a fully escaped pattern, but it cannot be broken by
    # special characters in the stem.
    a_vec = [a_word.count(word) for word in stem]
    b_vec = [b_word.count(word) for word in stem]
    return a_vec, b_vec
def col_sim(vec1, vec2):
    """Return the cosine similarity of two vectors.

    Computes ``vec1 . vec2 / (|vec1| * |vec2|)``.

    Args:
        vec1: First vector (NumPy array or any array-like of numbers).
        vec2: Second vector, same length as ``vec1``.

    Returns:
        The cosine similarity as a NumPy float. If either vector has zero
        norm (all zeros or empty) the similarity is undefined and ``0.0`` is
        returned instead of ``nan``.
    """
    vec1 = np.asarray(vec1)
    vec2 = np.asarray(vec2)
    denominator = np.linalg.norm(vec1) * np.linalg.norm(vec2)
    # A zero-norm vector would cause a division by zero (nan plus a
    # RuntimeWarning); treat it as "no similarity".
    if denominator == 0:
        return np.float64(0.0)
    return vec1.dot(vec2) / denominator
def cal_cos_sim(a_word: str, a_stem: list, b_word: str, b_stem: list):
    """Compute the cosine similarity between two words.

    The stems of both words are merged with ``merge_stem``; the merged stems
    are passed to ``gen_word_vec`` together with both words to obtain one
    vector per word; the two vectors are converted to NumPy arrays and
    compared with ``col_sim``.

    Args:
        a_word: First word.
        a_stem: Stems of the first word.
        b_word: Second word.
        b_stem: Stems of the second word.

    Returns:
        The value returned by ``col_sim`` for the two word vectors.
    """
    vec_a, vec_b = gen_word_vec(a_word, b_word, merge_stem(a_stem, b_stem))
    return col_sim(np.array(vec_a), np.array(vec_b))
|