cal.py 3.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116
  1. # -*- coding:utf-8 -*-
  2. import config
  3. import re
  4. import numpy as np
  5. def merge_stem(a_stem:list, b_stem:list):
  6. """
  7. 合并词根
  8. """
  9. return list(set(a_stem).union(set(b_stem)))
  10. def gen_word_vec(a_word:str, b_word:str, stem:list):
  11. """
  12. 生成词向量
  13. """
  14. a_vec, b_vec = [], []
  15. for word in stem:
  16. # if re.findall(word, config.RE_SPECIAL_SIMBOL):
  17. if word in config.RE_SPECIAL_SIMBOL:
  18. word = "\\" + word
  19. if word == "c++":
  20. word = "c\\+\\+"
  21. a_vec.append(len(re.findall(word, a_word)))
  22. b_vec.append(len(re.findall(word, b_word)))
  23. return a_vec, b_vec
  24. def col_sim(vec1, vec2):
  25. """
  26. 计算余弦相似性
  27. """
  28. return vec1.dot(vec2) / (np.linalg.norm(vec1) * np.linalg.norm(vec2))
  29. def cal_cos_sim(a_word:str, a_stem:list, b_word:str, b_stem:list):
  30. """
  31. 计算余弦相似性
  32. """
  33. union_stem = merge_stem(a_stem, b_stem)
  34. a_vec, b_vec = gen_word_vec(a_word, b_word, union_stem)
  35. val = col_sim(np.array(a_vec), np.array(b_vec))
  36. return val
  37. if __name__ == "__main__":
  38. # a_word= "QQ邮箱格式怎么写"
  39. # b_word= "QQ邮箱格式如何写"
  40. # a_word_root = ['QQ', '邮箱', '格式', '怎么', '写']
  41. # b_word_root = ['QQ', '邮箱', '格式', '如何', '写']
  42. # print(cal_cos_sim(a_word, a_word_root, b_word, b_word_root))
  43. # 合并词根,用于生成词向量
  44. # union_word_root = merge_stem(a_word_root, b_word_root)
  45. # print(union_word_root)
  46. # # 生成词向量
  47. # a_vec, b_vec = gen_word_vec(a_word, b_word, union_word_root)
  48. # print(a_vec)
  49. # print(b_vec)
  50. # # a_vec = [1,1,1,1,0,1]
  51. # # b_vec = [1,1,1,0,1,1]
  52. # print(col_sim(np.array(a_vec), np.array(b_vec)))
  53. # s = "0,腋下长了一个小疙瘩是什么东西,['腋下', '长', '了', '一个', '小', '疙瘩', '是', '什么', '东西']"
  54. # s_r = r"'([^,]*)'"
  55. # pattern = re.compile(s_r, re.I)
  56. # for i in pattern.findall(s):
  57. # print(i)
  58. # s_r = r"([\d]*),(.*),\["
  59. # pattern = re.compile(s_r, re.I)
  60. # m = pattern.match(s)
  61. # for i in m.groups():
  62. # print(i)
  63. # import mmap
  64. # with open(config.KEY_FILE, "r", encoding=config.ENCODING_CHARSET) as f_key, \
  65. # mmap.mmap(f_key.fileno(), 0, access=mmap.ACCESS_READ) as f_key_mmap:
  66. # key_info_re = r"([\d]*),(.*),\["
  67. # key_info_pattern = re.compile(key_info_re, re.I)
  68. # s_r = r"'([^,]*)'"
  69. # s_pattern = re.compile(s_r, re.I)
  70. # a_line = f_key_mmap.readline().decode("UTF-8")
  71. # b_line = f_key_mmap.readline().decode("UTF-8")
  72. # a_m = key_info_pattern.match(a_line)
  73. # a_key = a_m.group(2)
  74. # a_stem = s_pattern.findall(a_line)
  75. # print(a_stem)
  76. # b_m = key_info_pattern.match(b_line)
  77. # b_key = b_m.group(2)
  78. # b_stem = s_pattern.findall(b_line)
  79. # print(b_stem)
  80. # print(cal_cos_sim(a_key, a_stem, b_key, b_stem))
  81. # a_key = "吃什么东西减肥最快"
  82. # a_stem = ['吃', '什么', '东西', '减肥', '最快']
  83. # b_key="vc++读写什么文件最快"
  84. # b_stem =['v', 'c++', '读写', '什么', '文件', '最快']
  85. # print(cal_cos_sim(a_key, a_stem, b_key, b_stem))
  86. # print(re.findall("c\\+\\+", "vc++读写什么文件最快"))
  87. print("".join([".", "?", "^", "$", "*", "+", "\\", "[", "]", "|", "{", "}", "(", ")"]))
  88. # s = r"([.?^$*+\[]|{}()])"
  89. s=r"([\\])"
  90. re.findall(s, "vc++读写什么文件最快")
  91. print(re.findall(s, "vc++读写什么文件\最快"))