| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146 |
- # -*- coding:utf-8 -*-
- import os
- import re
- import pickle
- import math
- from multiprocessing import Manager, Process
# Word-segmentation result file; read line by line as "<keyword>,<rest>"
# when no pickled cache exists yet.
CUT_WORD_RESULT = "./data/分词结果_bak.txt"
# Pickled cache of the keyword -> segmentation-result dictionary.
CUT_WORD_CACHE = "./data/pkl/word_root_cache.pkl"
# Output path template for analysis results; %s is filled with the
# reference keyword of the group being written.
ANALYSE_OUTPUT_FILE = "./data/category/%s.txt"
# Characters that have a special meaning in regular expressions.
SPECIAL_SIMBOL = [".", "?", "^", "$", "*", "+", "\\", "[", "]", "|", "{", "}", "(", ")"]
def merge_word_root(word_root_a, word_root_b):
    """
    Merge two collections of word roots into a single de-duplicated list.

    Both arguments may be any iterable of hashable items. The order of the
    returned list is unspecified because it comes from set iteration.
    """
    roots_a = set(word_root_a)
    roots_b = set(word_root_b)
    return list(roots_a | roots_b)
def gen_word_vector(word_a, word_b, word_root_union):
    """
    Build the occurrence-count vectors of two keywords over a shared root list.

    For every root in ``word_root_union`` the number of non-overlapping
    occurrences of that root in ``word_a`` and in ``word_b`` is recorded.

    Roots are matched as literal substrings. Using them as regular-expression
    patterns breaks on roots that contain metacharacters: ``"c++"`` is an
    invalid pattern and ``"a.b"`` would match any middle character.

    :param word_a: first keyword (str)
    :param word_b: second keyword (str)
    :param word_root_union: iterable of roots defining the vector dimensions
    :return: tuple ``(a_word_vector, b_word_vector)`` of two equally long
             lists of ints, ordered like ``word_root_union``
    """
    a_word_vector, b_word_vector = [], []
    # Single pass so that a one-shot iterable of roots is also supported.
    for root in word_root_union:
        # str.count counts non-overlapping literal occurrences, which is
        # what re.findall returns for a fully escaped pattern.
        a_word_vector.append(word_a.count(root))
        b_word_vector.append(word_b.count(root))
    return a_word_vector, b_word_vector
def vector_multi(a_vector, b_vector):
    """
    Return the dot product of two vectors.

    Components are paired positionally; if the vectors differ in length,
    the extra components of the longer one are ignored.
    """
    total = 0
    for a_val, b_val in zip(a_vector, b_vector):
        total = total + a_val * b_val
    return total
def vector_square_sum(word_vector):
    """
    Return the sum of the squares of all components of a vector.

    :param word_vector: iterable of numbers
    :return: sum of ``x * x`` over the components; 0 for an empty vector
    """
    # Use the builtin sum() directly rather than a local accumulator
    # that would shadow it.
    return sum(value * value for value in word_vector)
def vector_cos(v_multi, a_v_ss, b_v_ss):
    """
    Compute the cosine of the angle between two vectors.

    :param v_multi: dot product of the two vectors
    :param a_v_ss: sum of squares of the first vector
    :param b_v_ss: sum of squares of the second vector
    :return: ``v_multi / (sqrt(a_v_ss) * sqrt(b_v_ss))``, or ``0.0`` when
             either vector has zero length (the cosine is undefined there,
             and 0.0 marks the pair as not similar instead of raising
             ZeroDivisionError)
    """
    denominator = math.sqrt(a_v_ss) * math.sqrt(b_v_ss)
    if denominator == 0:
        return 0.0
    return v_multi / denominator
def cal_cos(a_word, b_word, word_dict):
    """
    Compute the cosine similarity of two long-tail keywords.

    ``word_dict`` maps each keyword to its word roots. The roots of both
    keywords are merged into one shared list, each keyword is turned into
    an occurrence-count vector over that list, and the cosine of the two
    vectors is returned. Raises KeyError if either keyword is missing from
    ``word_dict``.
    """
    # The merged roots define the dimensions shared by both vectors.
    shared_roots = merge_word_root(word_dict[a_word], word_dict[b_word])
    vec_a, vec_b = gen_word_vector(a_word, b_word, shared_roots)
    return vector_cos(
        vector_multi(vec_a, vec_b),
        vector_square_sum(vec_a),
        vector_square_sum(vec_b),
    )
def process(global_word_root, global_del_cache, a_key, keys, threshold=0.8):
    """
    Collect every keyword in ``keys`` that is similar to ``a_key`` and write
    the group to ``ANALYSE_OUTPUT_FILE % a_key``.

    A keyword is added to the group when its cosine similarity to ``a_key``
    is greater than ``threshold`` and it is not already in
    ``global_del_cache``. Each accepted keyword is appended to
    ``global_del_cache``.

    :param global_word_root: mapping keyword -> word roots, passed to cal_cos
    :param global_del_cache: list-like of already grouped keywords; mutated
    :param a_key: reference keyword; also used to build the output file name
    :param keys: candidate keywords to compare against ``a_key``
    :param threshold: minimum (exclusive) cosine value for a match
    """
    # Create the output directory before the scan so that a missing or
    # uncreatable directory fails immediately, not after all the work.
    output_dir = os.path.dirname(ANALYSE_OUTPUT_FILE)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    container = []
    total_num = len(keys)
    for j, b_key in enumerate(keys):
        if j % 100000 == 0:
            print("处理进度:%d / %d" % (j, total_num))
        cos_val = cal_cos(a_key, b_key, global_word_root)
        if cos_val > threshold and b_key not in global_del_cache:
            print("%s 与 %s 的余弦值:%f " % (a_key, b_key, cos_val))
            container.append(b_key)
            global_del_cache.append(b_key)

    # First line is the reference keyword, followed by one match per line.
    with open(ANALYSE_OUTPUT_FILE % a_key, "w", encoding="UTF-8") as f:
        f.write(a_key + "\n")
        f.writelines(b_key + "\n" for b_key in container)
def load_word_root_cache():
    """
    Load the keyword -> segmentation-result dictionary.

    If the pickle file ``CUT_WORD_CACHE`` exists, it is loaded and returned.
    Otherwise the dictionary is built from ``CUT_WORD_RESULT``: each line is
    split at its first comma, the text before the comma becomes the key and
    everything after it (including the line's trailing newline) becomes the
    value. The result is then pickled to ``CUT_WORD_CACHE`` and returned.

    Lines without a comma (for example blank lines) are skipped.

    :return: dict mapping keyword (str) to the remainder of its line (str)
    """
    if os.path.exists(CUT_WORD_CACHE):
        print("存在缓存,开始加载")
        # NOTE(security): pickle.load can execute arbitrary code; this is
        # only safe because the cache is a local file written by this script.
        with open(CUT_WORD_CACHE, "rb") as f:
            return pickle.load(f)

    print('不存在缓存,开始构建分词字典')
    word_root_cache = {}
    with open(CUT_WORD_RESULT, "r", encoding="UTF-8") as f:
        # Stream the file so it is never held in memory as a whole.
        for line in f:
            keyword, separator, roots = line.partition(",")
            if not separator:
                # No comma on this line: nothing to map, skip it.
                continue
            word_root_cache[keyword] = roots

    print("构建完成,保存到本地")
    # Make sure the cache directory exists so the dump cannot fail on it.
    cache_dir = os.path.dirname(CUT_WORD_CACHE)
    if cache_dir:
        os.makedirs(cache_dir, exist_ok=True)
    with open(CUT_WORD_CACHE, "wb") as f:
        pickle.dump(word_root_cache, f)

    return word_root_cache
def main():
    """
    Load the segmentation dictionary and group keywords around the first one.

    The dictionary and the list of already grouped keywords are shared with
    the worker through a multiprocessing ``Manager``. A single worker process
    compares the first keyword against all remaining keywords, and this
    function waits for it to finish.
    """
    word_root_cache = load_word_root_cache()
    keys = list(word_root_cache)
    if not keys:
        # Nothing to compare; keys[0] below would raise IndexError.
        print("分词字典为空,无需处理")
        return

    manager = Manager()
    global_word_root = manager.dict(word_root_cache)
    global_del_cache = manager.list()
    p = Process(
        target=process,
        args=(global_word_root, global_del_cache, keys[0], keys[1:]),
    )
    # The process must be started before join(); joining a Process that
    # was never started raises "can only join a started process".
    p.start()
    p.join()
# Run main() only when this file is executed as a script, not when it is
# imported as a module.
if __name__ == "__main__":
    main()
|