analyse.py

# -*- coding:utf-8 -*-
import math
import os
import pickle
import re

import jieba

# Regex metacharacters that must be escaped before a root is used as a pattern.
SPECIAL_SYMBOLS = [".", "?", "^", "$", "*", "+", "\\", "[", "]", "|", "{", "}", "(", ")"]


def cut_word(word):
    """Segment a keyword into word roots (jieba search-engine mode)."""
    word_root = jieba.cut_for_search(word)
    return list(word_root)
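
# Quick sanity check (illustrative only: the exact segmentation depends on
# jieba's dictionary version, and cut_for_search may emit extra fine-grained
# pieces):
#
#   >>> cut_word("苹果手机多少钱")
#   ['苹果', '手机', '多少', '钱']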


def merge_word_root(word_root_a, word_root_b):
    """Merge two root lists into a deduplicated union."""
    return list(set(word_root_a).union(set(word_root_b)))


def gen_word_vector(word_a, word_b, word_root_union):
    """Build term-frequency vectors for both keywords over the merged roots."""
    a_word_vector, b_word_vector = [], []
    for word in word_root_union:
        if word in SPECIAL_SYMBOLS:
            word = "\\" + word  # escape the root so re.findall treats it literally
        a_word_vector.append(len(re.findall(word, word_a)))
        b_word_vector.append(len(re.findall(word, word_b)))
    return a_word_vector, b_word_vector
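
# Illustrative call, counting each merged root in both strings via re.findall:
# with roots ["苹果", "手机"], word_a = "苹果手机苹果" and word_b = "手机",
# the result is ([2, 1], [0, 1]), since "苹果" occurs twice in word_a and
# never in word_b.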


def vector_multi(a_vector, b_vector):
    """Dot product: multiply the two vectors element-wise and sum."""
    return sum(a * b for a, b in zip(a_vector, b_vector))


def vector_square_sum(word_vector):
    """Sum of squares of the vector's components."""
    return sum(i * i for i in word_vector)


def vector_cos(v_multi, a_v_ss, b_v_ss):
    """Cosine value: dot product divided by the product of the two norms."""
    denominator = math.sqrt(a_v_ss) * math.sqrt(b_v_ss)
    if denominator == 0:  # an all-zero vector has no direction
        return 0.0
    return v_multi / denominator
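
# Worked example: vectors [1, 1, 0] and [1, 0, 1] have dot product 1 and each
# has a sum of squares of 2, so the cosine is 1 / (sqrt(2) * sqrt(2)) = 0.5
# (up to floating-point rounding):
#
#   vector_cos(vector_multi([1, 1, 0], [1, 0, 1]),
#              vector_square_sum([1, 1, 0]),
#              vector_square_sum([1, 0, 1]))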


def cal_cos(a_word, b_word, word_dict):
    """Compute the cosine similarity of two long-tail keywords."""
    # a_word_root = cut_word(a_word)
    # b_word_root = cut_word(b_word)
    a_word_root = word_dict[a_word]
    b_word_root = word_dict[b_word]
    # Merge the roots; the union is the basis of both word vectors.
    union_word_root = merge_word_root(a_word_root, b_word_root)
    # Build the word vectors.
    a_vector, b_vector = gen_word_vector(a_word, b_word, union_word_root)
    # Dot product of the two vectors.
    ab_vector_multi = vector_multi(a_vector, b_vector)
    # Sum of squares of each vector.
    a_vector_square_sum = vector_square_sum(a_vector)
    b_vector_square_sum = vector_square_sum(b_vector)
    cos_val = vector_cos(ab_vector_multi, a_vector_square_sum, b_vector_square_sum)
    return cos_val
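
# A minimal usage sketch, assuming the two keywords and their (hypothetical)
# root lists are already present in the dictionary:
#
#   demo_dict = {"苹果手机多少钱": ["苹果", "手机", "多少", "钱"],
#                "苹果手机价格": ["苹果", "手机", "价格"]}
#   print(cal_cos("苹果手机多少钱", "苹果手机价格", demo_dict))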


def load_word_root_cache():
    word_root_cache = {}
    if os.path.exists("./data/pkl/word_root_cache.pkl"):
        print("Cache found, loading")
        with open("./data/pkl/word_root_cache.pkl", "rb") as f:
            word_root_cache = pickle.load(f)
        return word_root_cache
    print("No cache found, building the segmentation dictionary")
    with open("./data/分词结果_bak.txt", "r", encoding="UTF-8") as f:
        for line in f:
            index = line.index(",")
            # Keyword before the first comma; its roots after it (assumed to
            # be comma-separated, so split them into a list for cal_cos).
            word_root_cache[line[:index]] = line[index + 1:].strip().split(",")
    print("Build finished, saving to disk")
    with open("./data/pkl/word_root_cache.pkl", "wb") as f:
        pickle.dump(word_root_cache, f)
    return word_root_cache
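
# Assumed layout of ./data/分词结果_bak.txt, one record per line: the keyword
# sits before the first comma and its roots follow (a hypothetical line):
#
#   苹果手机多少钱,苹果,手机,多少,钱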


word_dict = load_word_root_cache()
key_list = list(word_dict.keys())
grouped = set()  # keywords already written into an earlier group
for i, a_key in enumerate(key_list):
    if a_key in grouped:
        continue
    with open("./data/category/%s.txt" % a_key, "w", encoding="UTF-8") as f:
        f.write(a_key)
        f.write("\n\n")
        for j, b_key in enumerate(key_list[i + 1:]):
            if b_key in grouped:
                continue
            if j % 100000 == 0:
                print("Processing: %d, %d" % (i, j))
            cos_val = cal_cos(a_key, b_key, word_dict)
            if cos_val > 0.8:
                print("Cosine of %s and %s: %f" % (a_key, b_key, cos_val))
                f.write(b_key)
                f.write("\n")
                grouped.add(b_key)  # same group, so skip it in later passes

# a_word = "腋下长了一个小疙瘩是什么东西"
# b_word = "什么东西吃蟑螂(四个字)"
# cos_val = cal_cos(a_word, b_word, word_dict)
# print(cos_val)
# print("Cosine: %f" % cos_val)
# print(cut_word(b_word))
# Notes:
# 1. Use the cached segmentation results instead of re-cutting every keyword.
# 2. Keywords whose cosine exceeds 0.8 are treated as one group.