|
@@ -0,0 +1,146 @@
|
|
|
|
|
+# -*- coding:utf-8 -*-
|
|
|
|
|
+
|
|
|
|
|
+import os
|
|
|
|
|
+import re
|
|
|
|
|
+import pickle
|
|
|
|
|
+import math
|
|
|
|
|
+from multiprocessing import Manager, Process
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
|
|
+# Word-segmentation result file (text input, one "keyword,roots" record per line)
|
|
|
|
|
+CUT_WORD_RESULT = "./data/分词结果_bak.txt"
|
|
|
|
|
+
|
|
|
|
|
+# Pickle cache of the word-segmentation dictionary
|
|
|
|
|
+CUT_WORD_CACHE = "./data/pkl/word_root_cache.pkl"
|
|
|
|
|
+
|
|
|
|
|
+# Output path template for analysis results; %s is filled with a keyword
|
|
|
|
|
+ANALYSE_OUTPUT_FILE = "./data/category/%s.txt"
|
|
|
|
|
+
|
|
|
|
|
+# Characters that have a special meaning in regular expressions
|
|
|
|
|
+SPECIAL_SIMBOL = [".", "?", "^", "$", "*", "+", "\\", "[", "]", "|", "{", "}", "(", ")"]
|
|
|
|
|
+
|
|
|
|
|
def merge_word_root(word_root_a, word_root_b):
    """Merge two collections of word roots into one duplicate-free list.

    The result keeps first-seen order (every distinct item of
    ``word_root_a`` first, then the items that only occur in
    ``word_root_b``), so the vectors built from it are reproducible from
    run to run instead of depending on set iteration order.

    Args:
        word_root_a: Iterable of hashable word roots.
        word_root_b: Iterable of hashable word roots.

    Returns:
        A list containing each distinct word root exactly once.
    """
    # dict keys behave like an insertion-ordered set.
    merged = dict.fromkeys(word_root_a)
    merged.update(dict.fromkeys(word_root_b))
    return list(merged)
|
|
|
|
|
+
|
|
|
|
|
def gen_word_vector(word_a, word_b, word_root_union):
    """Build occurrence-count vectors for two keywords.

    For every word root in ``word_root_union`` the number of
    non-overlapping literal occurrences in each keyword is counted.

    Roots are matched as plain text, never as regular expressions, so
    roots that contain regex metacharacters (for example ``"c++"`` or
    ``"a.b"``) are counted correctly instead of raising ``re.error`` or
    matching unrelated text.

    Args:
        word_a: First keyword (str).
        word_b: Second keyword (str).
        word_root_union: Iterable of word roots (str) defining the
            vector dimensions, in order.

    Returns:
        A tuple ``(a_word_vector, b_word_vector)`` of two lists of ints,
        each with one entry per word root.
    """
    a_word_vector, b_word_vector = [], []
    # Single pass so that a one-shot iterable is also supported.
    for word in word_root_union:
        a_word_vector.append(word_a.count(word))
        b_word_vector.append(word_b.count(word))
    return a_word_vector, b_word_vector
|
|
|
|
|
+
|
|
|
|
|
def vector_multi(a_vector, b_vector):
    """Return the dot product of two vectors.

    Corresponding elements are multiplied and the products summed. If the
    vectors differ in length, the extra elements of the longer one are
    ignored (``zip`` stops at the shorter input).

    Args:
        a_vector: Iterable of numbers.
        b_vector: Iterable of numbers.

    Returns:
        The sum of the element-wise products; 0 for empty input.
    """
    return sum(a * b for a, b in zip(a_vector, b_vector))
|
|
|
|
|
+
|
|
|
|
|
def vector_square_sum(word_vector):
    """Return the sum of the squares of the vector's elements.

    Args:
        word_vector: Iterable of numbers.

    Returns:
        The sum of ``x * x`` over all elements; 0 for empty input.
    """
    # Uses the builtin sum directly instead of shadowing it with a local.
    return sum(value * value for value in word_vector)
|
|
|
|
|
+
|
|
|
|
|
def vector_cos(v_multi, a_v_ss, b_v_ss):
    """Compute the cosine value from a dot product and two squared norms.

    Args:
        v_multi: Dot product of the two vectors.
        a_v_ss: Sum of squares of the first vector.
        b_v_ss: Sum of squares of the second vector.

    Returns:
        ``v_multi / (sqrt(a_v_ss) * sqrt(b_v_ss))``, or ``0.0`` when
        either squared norm is zero (an all-zero vector has no direction,
        so it is treated as not similar to anything rather than raising
        ``ZeroDivisionError``).
    """
    denominator = math.sqrt(a_v_ss) * math.sqrt(b_v_ss)
    if denominator == 0:
        return 0.0
    return v_multi / denominator
|
|
|
|
|
+
|
|
|
|
|
def cal_cos(a_word, b_word, word_dict):
    """Compute the cosine value of two long-tail keywords.

    Args:
        a_word: First keyword; must be a key of ``word_dict``.
        b_word: Second keyword; must be a key of ``word_dict``.
        word_dict: Mapping from keyword to its word roots.

    Returns:
        The value produced by ``vector_cos`` for the two keywords' word
        vectors.
    """
    roots_a, roots_b = word_dict[a_word], word_dict[b_word]

    # The merged roots define the dimensions of both word vectors.
    shared_roots = merge_word_root(roots_a, roots_b)
    vec_a, vec_b = gen_word_vector(a_word, b_word, shared_roots)

    # Dot product first, then the two squared norms, then the cosine.
    dot_product = vector_multi(vec_a, vec_b)
    norm_sq_a = vector_square_sum(vec_a)
    norm_sq_b = vector_square_sum(vec_b)
    return vector_cos(dot_product, norm_sq_a, norm_sq_b)
|
|
|
|
|
+
|
|
|
|
|
def process(global_word_root, global_del_cache, a_key, keys):
    """Collect the keywords similar to ``a_key`` and write them to a file.

    Every keyword in ``keys`` is compared with ``a_key`` via ``cal_cos``.
    A keyword is kept when its cosine value is greater than 0.8 and it is
    not already present in ``global_del_cache``; kept keywords are also
    appended to ``global_del_cache``. The result is written to
    ``ANALYSE_OUTPUT_FILE % a_key``: ``a_key`` on the first line, then one
    kept keyword per line.

    Args:
        global_word_root: Mapping from keyword to its word roots.
        global_del_cache: Mutable list-like object of keywords that have
            already been assigned; it is extended in place.
        a_key: Reference keyword; also used to build the output file name.
        keys: Sequence of candidate keywords to compare against ``a_key``.
    """
    output_path = ANALYSE_OUTPUT_FILE % a_key
    # Create the output directory before the (potentially very long) scan
    # so that a missing directory cannot discard the finished work.
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    container = []
    total_num = len(keys)
    for j, b_key in enumerate(keys):
        if j % 100000 == 0:
            print("处理进度:%d / %d" % (j, total_num))
        cos_val = cal_cos(a_key, b_key, global_word_root)
        if cos_val > 0.8 and b_key not in global_del_cache:
            print("%s 与 %s 的余弦值:%f " % (a_key, b_key, cos_val))
            container.append(b_key)
            global_del_cache.append(b_key)

    with open(output_path, "w", encoding="UTF-8") as f:
        f.write(a_key + "\n")
        f.writelines(b_key + "\n" for b_key in container)
|
|
|
|
|
+
|
|
|
|
|
def load_word_root_cache():
    """Load the keyword -> word-root dictionary, building it if needed.

    If the pickle file ``CUT_WORD_CACHE`` exists it is loaded and
    returned. Otherwise ``CUT_WORD_RESULT`` is parsed line by line: the
    text before the first comma is the keyword and the rest of the line
    (without the trailing newline) is stored as its value. The resulting
    dictionary is pickled to ``CUT_WORD_CACHE`` and returned.

    Lines that contain no comma (for example blank lines) are skipped
    instead of aborting the whole build.

    Returns:
        dict mapping each keyword (str) to the remainder of its line (str).
    """
    if os.path.exists(CUT_WORD_CACHE):
        print("存在缓存,开始加载")
        # NOTE(security): pickle.load can execute arbitrary code. This is
        # only acceptable because the cache file is written locally by this
        # function; never point CUT_WORD_CACHE at an untrusted file.
        with open(CUT_WORD_CACHE, "rb") as f:
            return pickle.load(f)

    print('不存在缓存,开始构建分词字典')
    word_root_cache = {}
    with open(CUT_WORD_RESULT, "r", encoding="UTF-8") as f:
        # Stream the file instead of materialising every line in memory.
        for line in f:
            keyword, separator, roots = line.rstrip("\n").partition(",")
            if not separator:
                continue  # malformed or blank line: no "keyword,roots" pair
            word_root_cache[keyword] = roots

    print("构建完成,保存到本地")
    cache_dir = os.path.dirname(CUT_WORD_CACHE)
    if cache_dir:
        os.makedirs(cache_dir, exist_ok=True)
    with open(CUT_WORD_CACHE, "wb") as f:
        pickle.dump(word_root_cache, f)

    return word_root_cache
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
|
|
def main():
    """Run the similarity analysis for the first keyword in a child process.

    Loads the keyword -> word-root dictionary, shares it (together with an
    empty "already assigned" list) through a ``multiprocessing.Manager``,
    and runs ``process`` in a separate process with the first keyword as
    the reference and all remaining keywords as candidates.
    """
    word_root_cache = load_word_root_cache()

    keys = list(word_root_cache)
    if not keys:
        # Nothing to compare; keys[0] below would raise IndexError.
        print("没有可分析的关键词")
        return

    # The context manager shuts the manager's server process down on exit.
    with Manager() as manager:
        global_word_root = manager.dict(word_root_cache)
        global_del_cache = manager.list()

        p = Process(
            target=process,
            args=(global_word_root, global_del_cache, keys[0], keys[1:]),
        )
        # A Process must be started before it can be joined; joining an
        # unstarted process raises AssertionError.
        p.start()
        p.join()
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
|
|
+# Run main() only when this file is executed as a script. With multiprocessing
+# the module may be re-imported in child processes (spawn start method), so
+# the guard also prevents children from running main() again.
+if __name__ == "__main__":
|
|
|
|
|
+ main()
|