# -*- coding:utf-8 -*-
"""Cosine similarity between two phrases, based on counts of their word stems."""
import re

import numpy as np

import config


def merge_stem(a_stem: list, b_stem: list):
    """Merge two stem lists into one list without duplicates.

    Args:
        a_stem: Stems of the first phrase.
        b_stem: Stems of the second phrase.

    Returns:
        A list holding every distinct stem once, in first-seen order
        (stems of ``a_stem`` first, then the new ones from ``b_stem``).
    """
    # dict.fromkeys de-duplicates like a set but keeps insertion order, so the
    # vectors derived from this list are reproducible between runs.
    return list(dict.fromkeys([*a_stem, *b_stem]))


def gen_word_vec(a_word: str, b_word: str, stem: list):
    """Build the stem-count vectors of two phrases.

    Each stem is treated as plain text, never as a regular expression, so
    stems containing regex metacharacters (e.g. ``"c++"``, ``"a.b"``) are
    counted literally and cannot raise ``re.error``.

    Args:
        a_word: First phrase.
        b_word: Second phrase.
        stem: Stems defining the vector dimensions, one dimension per stem.

    Returns:
        A tuple ``(a_vec, b_vec)`` of integer lists; element ``i`` is the
        number of non-overlapping occurrences of ``stem[i]`` in the phrase.
    """
    a_vec = [a_word.count(word) for word in stem]
    b_vec = [b_word.count(word) for word in stem]
    return a_vec, b_vec


def col_sim(vec1, vec2):
    """Compute the cosine similarity of two vectors.

    Args:
        vec1: First vector (NumPy array or any sequence of numbers).
        vec2: Second vector, same length as ``vec1``.

    Returns:
        ``vec1 . vec2 / (|vec1| * |vec2|)``, or ``0.0`` when either vector
        has zero norm (including empty vectors), where the cosine is undefined.
    """
    vec1 = np.asarray(vec1)
    vec2 = np.asarray(vec2)
    denominator = np.linalg.norm(vec1) * np.linalg.norm(vec2)
    if denominator == 0:
        # Avoid 0/0 -> nan (and the accompanying RuntimeWarning).
        return 0.0
    return np.dot(vec1, vec2) / denominator


def cal_cos_sim(a_word: str, a_stem: list, b_word: str, b_stem: list):
    """Compute the cosine similarity of two phrases from their stems.

    Args:
        a_word: First phrase.
        a_stem: Stems of the first phrase.
        b_word: Second phrase.
        b_stem: Stems of the second phrase.

    Returns:
        Cosine similarity of the two stem-count vectors built over the
        union of ``a_stem`` and ``b_stem``.
    """
    union_stem = merge_stem(a_stem, b_stem)
    a_vec, b_vec = gen_word_vec(a_word, b_word, union_stem)
    return col_sim(np.array(a_vec), np.array(b_vec))


if __name__ == "__main__":
    # --- Scratch example: similarity of two near-identical phrases ---------
    # a_word= "QQ邮箱格式怎么写"
    # b_word= "QQ邮箱格式如何写"
    # a_word_root = ['QQ', '邮箱', '格式', '怎么', '写']
    # b_word_root = ['QQ', '邮箱', '格式', '如何', '写']
    # print(cal_cos_sim(a_word, a_word_root, b_word, b_word_root))

    # Merge the stems; the union is used to build the word vectors.
    # union_word_root = merge_stem(a_word_root, b_word_root)
    # print(union_word_root)
    # # Generate the word vectors.
    # a_vec, b_vec = gen_word_vec(a_word, b_word, union_word_root)
    # print(a_vec)
    # print(b_vec)
    # # a_vec = [1,1,1,1,0,1]
    # # b_vec = [1,1,1,0,1,1]
    # print(col_sim(np.array(a_vec), np.array(b_vec)))

    # --- Scratch example: parsing one "id,phrase,[stems]" record -----------
    # s = "0,腋下长了一个小疙瘩是什么东西,['腋下', '长', '了', '一个', '小', '疙瘩', '是', '什么', '东西']"
    # s_r = r"'([^,]*)'"
    # pattern = re.compile(s_r, re.I)
    # for i in pattern.findall(s):
    #     print(i)
    # s_r = r"([\d]*),(.*),\["
    # pattern = re.compile(s_r, re.I)
    # m = pattern.match(s)
    # for i in m.groups():
    #     print(i)

    # --- Scratch example: comparing the first two records of the key file --
    # import mmap
    # with open(config.KEY_FILE, "r", encoding=config.ENCODING_CHARSET) as f_key, \
    #         mmap.mmap(f_key.fileno(), 0, access=mmap.ACCESS_READ) as f_key_mmap:
    #     key_info_re = r"([\d]*),(.*),\["
    #     key_info_pattern = re.compile(key_info_re, re.I)
    #     s_r = r"'([^,]*)'"
    #     s_pattern = re.compile(s_r, re.I)
    #     a_line = f_key_mmap.readline().decode("UTF-8")
    #     b_line = f_key_mmap.readline().decode("UTF-8")
    #     a_m = key_info_pattern.match(a_line)
    #     a_key = a_m.group(2)
    #     a_stem = s_pattern.findall(a_line)
    #     print(a_stem)
    #     b_m = key_info_pattern.match(b_line)
    #     b_key = b_m.group(2)
    #     b_stem = s_pattern.findall(b_line)
    #     print(b_stem)
    #     print(cal_cos_sim(a_key, a_stem, b_key, b_stem))

    # --- Scratch example: stems containing regex metacharacters ------------
    # a_key = "吃什么东西减肥最快"
    # a_stem = ['吃', '什么', '东西', '减肥', '最快']
    # b_key="vc++读写什么文件最快"
    # b_stem =['v', 'c++', '读写', '什么', '文件', '最快']
    # print(cal_cos_sim(a_key, a_stem, b_key, b_stem))
    # print(re.findall("c\\+\\+", "vc++读写什么文件最快"))

    # Print the regex metacharacters as one string.
    print("".join([".", "?", "^", "$", "*", "+", "\\", "[", "]", "|", "{", "}", "(", ")"]))
    # s = r"([.?^$*+\[]|{}()])"
    s = r"([\\])"  # pattern matching a single backslash
    re.findall(s, "vc++读写什么文件最快")  # result intentionally discarded
    # The backslash is written as "\\" so the literal has no invalid escape
    # sequence; the runtime string is unchanged.
    print(re.findall(s, "vc++读写什么文件\\最快"))