# key_reverse.py
# -*- coding:utf-8 -*-
import ast
import os
import re
import sys
from time import time

import config
import tools
import stop_word
  10. TITLE = "关键词倒排文件"
  11. def main():
  12. """
  13. 构建待排表
  14. """
  15. tools.init_log()
  16. tools.log_start_msg(TITLE)
  17. # 提取规则
  18. s = r"(\d+),([^,]*),(.*)"
  19. pattern = re.compile(s, re.I)
  20. # 倒排表 容器
  21. key_reverse = {}
  22. # 停用表
  23. stop_word_cache = stop_word.load_stop_word()
  24. with open(config.KEY_FILE, "r", encoding=config.ENCODING_CHARSET) as fkey:
  25. # 获取文件总大小,获取后需要复原光标位置
  26. fkey.seek(0, os.SEEK_END)
  27. total_num = fkey.tell()
  28. fkey.seek(0)
  29. while True:
  30. # 获取当前处理位置
  31. cur_pos = fkey.tell()
  32. # 进度提示
  33. tools.tip_in_size(total_num, cur_pos)
  34. # 读取关键词数据
  35. line = fkey.readline()
  36. # 如果到行尾则结束
  37. if not line:
  38. break
  39. # 提取数据
  40. m = pattern.match(line)
  41. # 获取关键词序号
  42. index = m.group(1)
  43. # 获取词根
  44. key_root = m.group(3)
  45. # 转换成真正的list对象
  46. for item in ast.literal_eval(key_root):
  47. # 排除停用词
  48. if item in stop_word_cache:
  49. continue
  50. # 构建倒排表
  51. val = key_reverse.get(item)
  52. if val:
  53. key_reverse[item].append(index)
  54. else:
  55. key_reverse[item]=[]
  56. key_reverse[item].append(index)
  57. # 保存到本地文件
  58. with open(config.KEY_REVERSE_FILE, "w", encoding=config.ENCODING_CHARSET) as f:
  59. for key, value in key_reverse.items():
  60. f.write("%s,%s\n" % (key, value))
  61. tools.log_end_msg(TITLE)
  62. if __name__ == "__main__":
  63. main()
# Manual check of cache load time (kept commented out):
# start = time()
# key_reverse_cache = tools.load_obj(config.KEY_REVERSE_CACHE)
# end = time()
# print("占用大小:", sys.getsizeof(key_reverse_cache))
# print("加载耗时:", end-start)