cal.py 2.0 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374
  1. # -*- coding:utf-8 -*-
  2. import config
  3. import re
  4. import math
  5. def merge_word_root(word_root_a, word_root_b):
  6. """
  7. 合并词根
  8. """
  9. return list(set(word_root_a).union(set(word_root_b)))
  10. def gen_word_vector(word_a, word_b, word_root_union):
  11. """
  12. 生成词向量
  13. """
  14. a_word_vector, b_word_vector = [], []
  15. for word in word_root_union:
  16. if word in config.RE_SPECIAL_SIMBOL :
  17. word = "\\" + word
  18. a_word_vector.append(len(re.findall(word, word_a)))
  19. b_word_vector.append(len(re.findall(word, word_b)))
  20. return a_word_vector, b_word_vector
  21. def vector_multi(a_vector, b_vector):
  22. """
  23. 向量相乘求和
  24. """
  25. return sum(map(lambda a_b: a_b[0]*a_b[1], zip(a_vector, b_vector)))
  26. def vector_square_sum(word_vector):
  27. """
  28. 向量平方求和
  29. """
  30. sum = 0
  31. for i in word_vector:
  32. sum = sum + i * i
  33. return sum
  34. def vector_cos(v_multi, a_v_ss, b_v_ss):
  35. """
  36. 计算余弦值
  37. """
  38. return v_multi / (math.sqrt(a_v_ss) * math.sqrt(b_v_ss))
  39. def cal_cos(a_word, b_word, a_word_root, b_word_root):
  40. """
  41. 计算两个长尾关键词的余弦值
  42. """
  43. # 合并词根,用于生成词向量
  44. union_word_root = merge_word_root(a_word_root, b_word_root)
  45. # 生成词向量
  46. a_vector, b_vector = gen_word_vector(a_word, b_word, union_word_root)
  47. # 词向量相乘求和
  48. ab_vector_multi = vector_multi(a_vector, b_vector)
  49. # 向量平方求和
  50. a_vector_squar_sum = vector_square_sum(a_vector)
  51. b_vector_squar_sum = vector_square_sum(b_vector)
  52. cos_val = vector_cos(ab_vector_multi, a_vector_squar_sum, b_vector_squar_sum)
  53. return cos_val
  54. if __name__ == "__main__":
  55. a_word = "腋下长了一个小疙瘩是什么东西"
  56. b_word = "白凉粉是什么东西"
  57. a_word_root = ['腋下', '长', '了', '一个', '小', '疙瘩', '是', '什么', '东西']
  58. b_word_root = ['白', '凉粉', '是', '什么', '东西']
  59. print(cal_cos(a_word, b_word, a_word_root, b_word_root))