# -*- coding:utf-8 -*-
"""Cosine similarity between two long-tail keywords.

Each keyword is tokenized with jieba's search-mode segmenter, the two token
sets are merged into a shared vocabulary, each keyword is turned into a
vector of per-token occurrence counts, and the cosine of the angle between
the two vectors is returned.
"""
import math
import re

try:
    import jieba
except ImportError:
    # The vector helpers below do not need the tokenizer; only cut_word()
    # (and therefore cal_cos()) does, and it reports the missing dependency.
    jieba = None

# Optional cache mapping keyword -> token list (filled by the disabled batch
# driver at the bottom of this file).
word_dict = {}

# Single-character regex metacharacters. No longer consulted by
# gen_word_vector (which escapes every token with re.escape), but kept under
# its original name so existing importers keep working.
SPECIAL_SIMBOL = [".", "?", "^", "$", "*", "+", "\\", "[", "]", "|", "{", "}",
                  "(", ")"]


def cut_word(word):
    """Tokenize *word* with jieba's search-engine mode.

    Args:
        word: The keyword string to segment.

    Returns:
        A list of token strings produced by ``jieba.cut_for_search``.

    Raises:
        ImportError: If jieba is not installed.
    """
    if jieba is None:
        raise ImportError("jieba is required by cut_word() but is not installed")
    return list(jieba.cut_for_search(word))


def merge_word_root(word_root_a, word_root_b):
    """Return the de-duplicated union of two token lists.

    The result comes from a ``set``, so its order is unspecified.
    """
    return list(set(word_root_a).union(word_root_b))


def gen_word_vector(word_a, word_b, word_root_union):
    """Build occurrence-count vectors for two keywords.

    Args:
        word_a: First keyword (raw string).
        word_b: Second keyword (raw string).
        word_root_union: Iterable of tokens defining the vector dimensions.

    Returns:
        A tuple ``(a_vector, b_vector)`` of integer lists; element *i* is the
        number of non-overlapping occurrences of token *i* in the keyword.
    """
    a_word_vector, b_word_vector = [], []
    for word in word_root_union:
        # Escape the whole token so it is always matched literally. Escaping
        # only tokens that are a single metacharacter let multi-character
        # tokens such as "c++" or "3.5" be interpreted as regex patterns.
        pattern = re.compile(re.escape(word))
        a_word_vector.append(len(pattern.findall(word_a)))
        b_word_vector.append(len(pattern.findall(word_b)))
    return a_word_vector, b_word_vector


def vector_multi(a_vector, b_vector):
    """Return the dot product of two equal-length vectors."""
    return sum(a * b for a, b in zip(a_vector, b_vector))


def vector_square_sum(word_vector):
    """Return the sum of the squares of the vector's components."""
    return sum(component * component for component in word_vector)


def vector_cos(v_multi, a_v_ss, b_v_ss):
    """Compute the cosine from a dot product and two squared magnitudes.

    Args:
        v_multi: Dot product of the two vectors.
        a_v_ss: Sum of squares of the first vector.
        b_v_ss: Sum of squares of the second vector.

    Returns:
        ``v_multi / (sqrt(a_v_ss) * sqrt(b_v_ss))``, or ``0.0`` when either
        vector has zero magnitude (the cosine is undefined there, and a
        zero vector shares no tokens with anything).
    """
    denominator = math.sqrt(a_v_ss) * math.sqrt(b_v_ss)
    if denominator == 0:
        return 0.0
    return v_multi / denominator


def cal_cos(a_word, b_word):
    """Return the cosine similarity of two long-tail keywords.

    Args:
        a_word: First keyword.
        b_word: Second keyword.

    Returns:
        The cosine of the two token-count vectors; ``0.0`` if either keyword
        yields an all-zero vector.

    Raises:
        ImportError: If jieba is not installed.
    """
    a_word_root = cut_word(a_word)
    b_word_root = cut_word(b_word)
    # Alternative kept from the original: read pre-computed tokens instead.
    # a_word_root = word_dict[a_word]
    # b_word_root = word_dict[b_word]

    # The merged token set defines the dimensions of both vectors.
    union_word_root = merge_word_root(a_word_root, b_word_root)
    a_vector, b_vector = gen_word_vector(a_word, b_word, union_word_root)

    ab_vector_multi = vector_multi(a_vector, b_vector)
    a_vector_squar_sum = vector_square_sum(a_vector)
    b_vector_squar_sum = vector_square_sum(b_vector)
    return vector_cos(ab_vector_multi, a_vector_squar_sum, b_vector_squar_sum)


# Disabled batch driver and ad-hoc example, kept for reference.
# NOTE(review): DATA_FILE is not defined anywhere in this file.
#
# with open(DATA_FILE, "r", encoding="UTF-8") as f:
#     lines = f.readlines()
#     for line in lines[:1000000]:
#         line = line.replace("\n", "")
#         word_root = cut_word(line)
#         word_dict[line]=word_root
#
# key_list = list(word_dict.keys())
# for i, a_key in enumerate(key_list[:-1]):
#     for j, b_key in enumerate(key_list[i+1:]):
#         if j % 100000 == 0 :
#             print("正在处理:%d, %d" % (i, j))
#         cos_val = cal_cos(a_key, b_key)
#         if cos_val > 0 :
#             print("%s 与 %s 的余弦值:%d " % (a_key, b_key, cos_val))
#
# a_word = "腋下长了一个小疙瘩是什么东西"
# b_word = "什么东西吃蟑螂(四个字)"
# cos_val = cal_cos(a_word, b_word)
# print(cos_val)
# print("的余弦值:%f " % ( cos_val))
# print(cut_word(b_word))