analyse.py 2.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109
  1. # -*- coding:utf-8 -*-
  2. import math
  3. import jieba
  4. import re
  # Mapping from a phrase to its token list; starts out empty.
  word_dict = dict()
  # Single characters that carry a special meaning inside a regular expression.
  SPECIAL_SIMBOL = [
      ".", "?", "^", "$", "*", "+", "\\",
      "[", "]", "|", "{", "}", "(", ")",
  ]
  def cut_word(word):
      """
      Tokenize ``word`` with ``jieba.cut_for_search`` and return the tokens as a list.
      """
      return list(jieba.cut_for_search(word))
  def merge_word_root(word_root_a, word_root_b):
      """
      Merge two token lists into one list of distinct tokens (set union).

      The order of the returned tokens is whatever set iteration yields.
      """
      tokens_a = set(word_root_a)
      tokens_b = set(word_root_b)
      return list(tokens_a | tokens_b)
  def gen_word_vector(word_a, word_b, word_root_union):
      """
      Build the term-count vectors of two phrases over a shared token list.

      For each token in ``word_root_union`` (in iteration order) the number of
      non-overlapping occurrences of that token in ``word_a`` and in ``word_b``
      is recorded.

      Tokens are matched literally with ``str.count`` instead of being used as
      regular-expression patterns. The previous regex approach escaped a token
      only when the whole token was a single metacharacter, so a token such as
      "c++" raised ``re.error`` and a token such as "a.b" also matched "axb".

      Returns:
          A tuple ``(a_word_vector, b_word_vector)`` of two lists of ints, each
          aligned with ``word_root_union``.
      """
      a_word_vector, b_word_vector = [], []
      for token in word_root_union:
          a_word_vector.append(word_a.count(token))
          b_word_vector.append(word_b.count(token))
      return a_word_vector, b_word_vector
  def vector_multi(a_vector, b_vector):
      """
      Return the dot product of two vectors.

      Elements are paired with ``zip``, so any surplus elements of the longer
      vector are ignored.
      """
      return sum(x * y for x, y in zip(a_vector, b_vector))
  def vector_square_sum(word_vector):
      """
      Return the sum of the squares of the vector's elements (0 for an empty vector).
      """
      return sum(component * component for component in word_vector)
  def vector_cos(v_multi, a_v_ss, b_v_ss):
      """
      Compute a cosine value from a dot product and two squared norms.

      Args:
          v_multi: dot product of the two vectors.
          a_v_ss: sum of squares of the first vector's elements.
          b_v_ss: sum of squares of the second vector's elements.

      Returns:
          ``v_multi / (sqrt(a_v_ss) * sqrt(b_v_ss))``, or ``0.0`` when either
          squared norm is zero. A zero-length vector has no direction, so it is
          treated as having no similarity instead of raising
          ``ZeroDivisionError``.
      """
      denominator = math.sqrt(a_v_ss) * math.sqrt(b_v_ss)
      if denominator == 0:
          return 0.0
      return v_multi / denominator
  def cal_cos(a_word, b_word):
      """
      Compute the cosine value of two long-tail keywords.

      Both phrases are tokenized with ``cut_word``, the tokens are pooled with
      ``merge_word_root``, each phrase is turned into a term-count vector by
      ``gen_word_vector``, and the result of ``vector_cos`` on those vectors is
      returned.
      """
      # Tokenize both phrases and pool the tokens into one shared vocabulary.
      vocabulary = merge_word_root(cut_word(a_word), cut_word(b_word))
      # Term-count vectors of each phrase over the shared vocabulary.
      vec_a, vec_b = gen_word_vector(a_word, b_word, vocabulary)
      # cosine = dot(a, b) / (|a| * |b|)
      return vector_cos(
          vector_multi(vec_a, vec_b),
          vector_square_sum(vec_a),
          vector_square_sum(vec_b),
      )
  66. # with open(DATA_FILE, "r", encoding="UTF-8") as f:
  67. # lines = f.readlines()
  68. # for line in lines[:1000000]:
  69. # line = line.replace("\n", "")
  70. # word_root = cut_word(line)
  71. # word_dict[line]=word_root
  72. # key_list = list(word_dict.keys())
  73. # for i, a_key in enumerate(key_list[:-1]):
  74. # for j, b_key in enumerate(key_list[i+1:]):
  75. # if j % 100000 == 0 :
  76. # print("正在处理:%d, %d" % (i, j))
  77. # cos_val = cal_cos(a_key, b_key)
  78. # if cos_val > 0 :
  79. # print("%s 与 %s 的余弦值:%d " % (a_key, b_key, cos_val))
  80. # a_word = "腋下长了一个小疙瘩是什么东西"
  81. # b_word = "什么东西吃蟑螂(四个字)"
  82. # cos_val = cal_cos(a_word, b_word)
  83. # print(cos_val)
  84. # print("的余弦值:%f " % ( cos_val))
  85. # print(cut_word(b_word))