| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116 |
- # -*- coding:utf-8 -*-
- import config
- import re
- import numpy as np
def merge_stem(a_stem: list, b_stem: list):
    """Merge two stem lists into a single list of unique stems.

    Stems are returned in first-seen order (everything from ``a_stem`` first,
    followed by the stems that only occur in ``b_stem``), so the result is
    reproducible from run to run instead of depending on set iteration order.

    Args:
        a_stem: Stems (hashable tokens) of the first text.
        b_stem: Stems (hashable tokens) of the second text.

    Returns:
        A new list that contains every distinct stem exactly once.
    """
    # dict keys keep insertion order, which gives an order-preserving
    # de-duplication without an explicit "seen" set.
    return list(dict.fromkeys(list(a_stem) + list(b_stem)))
def gen_word_vec(a_word: str, b_word: str, stem: list):
    """Build term-count vectors for two texts over a shared stem list.

    Every stem is counted as a literal substring (non-overlapping
    occurrences, scanned left to right). Stems that contain regular
    expression metacharacters, such as ``c++``, ``a.b`` or ``(``, therefore
    need no escaping and can never be misread as a pattern.

    Args:
        a_word: First text.
        b_word: Second text.
        stem: Stems that define the vector dimensions, in order.

    Returns:
        A tuple ``(a_vec, b_vec)`` of two lists of ints. Element ``i`` of each
        list is the number of times ``stem[i]`` occurs in the corresponding
        text. An empty stem is given a count of 0 in both vectors.
    """
    a_vec, b_vec = [], []
    for word in stem:
        # str.count("") reports len(text) + 1 "matches"; an empty stem carries
        # no information, so it contributes a zero count instead.
        a_vec.append(a_word.count(word) if word else 0)
        b_vec.append(b_word.count(word) if word else 0)
    return a_vec, b_vec
def col_sim(vec1, vec2):
    """Return the cosine similarity of two vectors.

    Args:
        vec1: First vector, a NumPy array or any 1-D sequence of numbers.
        vec2: Second vector, with the same length as ``vec1``.

    Returns:
        ``dot(vec1, vec2) / (norm(vec1) * norm(vec2))`` as a float, or ``0.0``
        when either vector has zero norm (the ratio is undefined there).
    """
    left = np.asarray(vec1, dtype=float)
    right = np.asarray(vec2, dtype=float)
    denominator = np.linalg.norm(left) * np.linalg.norm(right)
    if denominator == 0:
        # A zero vector would give 0/0, which NumPy evaluates to nan while
        # emitting a RuntimeWarning; report "no similarity" instead.
        return 0.0
    return left.dot(right) / denominator
def cal_cos_sim(a_word: str, a_stem: list, b_word: str, b_stem: list):
    """Compute the cosine similarity of two texts from their stems.

    The two stem lists are merged into one shared vocabulary, each text is
    turned into a count vector over that vocabulary, and the cosine
    similarity of the two vectors is returned.

    Args:
        a_word: First text.
        a_stem: Stems of the first text.
        b_word: Second text.
        b_stem: Stems of the second text.

    Returns:
        The value computed by ``col_sim`` for the two count vectors.
    """
    vocabulary = merge_stem(a_stem, b_stem)
    counts_a, counts_b = gen_word_vec(a_word, b_word, vocabulary)
    return col_sim(np.array(counts_a), np.array(counts_b))
if __name__ == "__main__":
    # Manual scratch checks. The commented-out snippets below are earlier
    # experiments, kept as usage examples for the functions in this module.

    # a_word= "QQ邮箱格式怎么写"
    # b_word= "QQ邮箱格式如何写"
    # a_word_root = ['QQ', '邮箱', '格式', '怎么', '写']
    # b_word_root = ['QQ', '邮箱', '格式', '如何', '写']
    # print(cal_cos_sim(a_word, a_word_root, b_word, b_word_root))
    # Merge the stems; the merged list is used to generate the word vectors.
    # union_word_root = merge_stem(a_word_root, b_word_root)
    # print(union_word_root)
    # # Generate the word vectors.
    # a_vec, b_vec = gen_word_vec(a_word, b_word, union_word_root)
    # print(a_vec)
    # print(b_vec)
    # # a_vec = [1,1,1,1,0,1]
    # # b_vec = [1,1,1,0,1,1]
    # print(col_sim(np.array(a_vec), np.array(b_vec)))
    # s = "0,腋下长了一个小疙瘩是什么东西,['腋下', '长', '了', '一个', '小', '疙瘩', '是', '什么', '东西']"
    # s_r = r"'([^,]*)'"
    # pattern = re.compile(s_r, re.I)
    # for i in pattern.findall(s):
    #     print(i)

    # s_r = r"([\d]*),(.*),\["
    # pattern = re.compile(s_r, re.I)
    # m = pattern.match(s)
    # for i in m.groups():
    #     print(i)
    # import mmap
    # with open(config.KEY_FILE, "r", encoding=config.ENCODING_CHARSET) as f_key, \
    #         mmap.mmap(f_key.fileno(), 0, access=mmap.ACCESS_READ) as f_key_mmap:
    #     key_info_re = r"([\d]*),(.*),\["
    #     key_info_pattern = re.compile(key_info_re, re.I)
    #     s_r = r"'([^,]*)'"
    #     s_pattern = re.compile(s_r, re.I)
    #     a_line = f_key_mmap.readline().decode("UTF-8")
    #     b_line = f_key_mmap.readline().decode("UTF-8")
    #     a_m = key_info_pattern.match(a_line)
    #     a_key = a_m.group(2)
    #     a_stem = s_pattern.findall(a_line)
    #     print(a_stem)
    #     b_m = key_info_pattern.match(b_line)
    #     b_key = b_m.group(2)
    #     b_stem = s_pattern.findall(b_line)
    #     print(b_stem)
    #     print(cal_cos_sim(a_key, a_stem, b_key, b_stem))
    # a_key = "吃什么东西减肥最快"
    # a_stem = ['吃', '什么', '东西', '减肥', '最快']
    # b_key="vc++读写什么文件最快"
    # b_stem =['v', 'c++', '读写', '什么', '文件', '最快']
    # print(cal_cos_sim(a_key, a_stem, b_key, b_stem))
    # print(re.findall("c\\+\\+", "vc++读写什么文件最快"))

    # Print the regex metacharacters joined into a single string.
    print("".join([".", "?", "^", "$", "*", "+", "\\", "[", "]", "|", "{", "}", "(", ")"]))
    # s = r"([.?^$*+\[]|{}()])"
    s = r"([\\])"  # character class that matches one literal backslash
    # The backslash in the sample text is spelled "\\": a bare "\" before a
    # non-escape character is an invalid escape sequence (DeprecationWarning,
    # SyntaxWarning since Python 3.12). The runtime string is unchanged.
    print(re.findall(s, "vc++读写什么文件\\最快"))
|