cal.py 1.0 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243
  1. # -*- coding:utf-8 -*-
  2. import config
  3. import re
  4. import numpy as np
  5. def merge_stem(a_stem:list, b_stem:list):
  6. """
  7. 合并词根
  8. """
  9. return list(set(a_stem).union(set(b_stem)))
  10. def gen_word_vec(a_word:str, b_word:str, stem:list):
  11. """
  12. 生成词向量
  13. """
  14. a_vec, b_vec = [], []
  15. for word in stem:
  16. # if re.findall(word, config.RE_SPECIAL_SIMBOL):
  17. if word in config.RE_SPECIAL_SIMBOL:
  18. word = "\\" + word
  19. if word == "c++":
  20. word = "c\\+\\+"
  21. a_vec.append(len(re.findall(word, a_word)))
  22. b_vec.append(len(re.findall(word, b_word)))
  23. return a_vec, b_vec
  24. def col_sim(vec1, vec2):
  25. """
  26. 计算余弦相似性
  27. """
  28. return vec1.dot(vec2) / (np.linalg.norm(vec1) * np.linalg.norm(vec2))
  29. def cal_cos_sim(a_word:str, a_stem:list, b_word:str, b_stem:list):
  30. """
  31. 计算余弦相似性
  32. """
  33. union_stem = merge_stem(a_stem, b_stem)
  34. a_vec, b_vec = gen_word_vec(a_word, b_word, union_stem)
  35. val = col_sim(np.array(a_vec), np.array(b_vec))
  36. return val