# agg_word.py
# -*- coding:utf-8 -*-
import ast
import logging
import mmap
import re

from bitmap import BitMap

import cal
import config
import stop_word
import tools
  12. TITLE = "聚合文件"
  13. def re_extract_key(pattern, line):
  14. """
  15. 正则提取关键词信息
  16. """
  17. m = pattern.match(line)
  18. # 关键词 序号
  19. index = m.group(1)
  20. # 关键词
  21. key = m.group(2)
  22. # 关键词 分词词根
  23. word_root = m.group(3)
  24. # 把index转换成数字方便使用
  25. return int(index), key, word_root
  26. def main():
  27. # 初始化日志配置
  28. tools.init_log()
  29. tools.log_start_msg(TITLE)
  30. # 停用词
  31. logging.info("加载停用词")
  32. stop_word_cache = stop_word.load_stop_word()
  33. # 关键词索引
  34. logging.info("加载关键词索引")
  35. key_index_cache = tools.load_obj(config.KEY_INDEX_CACHE)
  36. # 倒排索引
  37. logging.info("加载倒排索引")
  38. key_reverse_index_cache = tools.load_obj(config.KEY_REVERSE_INDEX_CACHE)
  39. # 正则 提取数据
  40. s = r"(\d+),([^,]*),(.*)"
  41. pattern = re.compile(s, re.I)
  42. with open(config.KEY_FILE, "r", encoding=config.ENCODING_CHARSET) as fkey, \
  43. open(config.KEY_REVERSE_FILE, "r", encoding=config.ENCODING_CHARSET) as freverse, \
  44. mmap.mmap(fkey.fileno(), 0, access=mmap.ACCESS_READ) as f_key_mmap, \
  45. mmap.mmap(freverse.fileno(), 0, access=mmap.ACCESS_READ) as f_reverse_mmap:
  46. # 计算总关键词数
  47. # TODO 这里要改成从统计信息中获取
  48. total_count = 14500029
  49. # 生成位图bitmap
  50. bm = BitMap(total_count)
  51. # 待处理的文件总大小
  52. total_num = f_key_mmap.size()
  53. while True:
  54. # 当前处理位置
  55. cur_pos = f_key_mmap.tell()
  56. # 进度提示
  57. tools.tip_in_size(total_num, cur_pos)
  58. # 获取要处理的关键词
  59. line = f_key_mmap.readline().decode(config.ENCODING_CHARSET)
  60. # 如果没有任何内容则结束
  61. if not line:
  62. logging.info("发现空白line")
  63. break
  64. # 提取信息
  65. index, key, word_root = re_extract_key(pattern, line)
  66. # bitmap校验,如果已经处理过则跳过
  67. if bm.test(index):
  68. logging.debug("主关键词:%s 已处理,跳过" % key)
  69. continue
  70. # 通过bitmap校验,设置对应的bit为0
  71. bm.set(index)
  72. # 聚合结果存放容器
  73. agg_cache = []
  74. # 记录主要关键词
  75. agg_cache.append(key)
  76. # 转换成真正的list对象
  77. logging.debug("当前处理的主关键词:%s, 词根数量:%d" % (key, len(word_root)))
  78. for item in ast.literal_eval(word_root):
  79. # 排除停用词
  80. if item in stop_word_cache:
  81. continue
  82. # 根据倒排索引,获取相关的关键词序号
  83. other_key_pos = key_reverse_index_cache.get(item)
  84. f_reverse_mmap.seek(other_key_pos)
  85. other_key_line = f_reverse_mmap.readline().decode(config.ENCODING_CHARSET)
  86. # 截取关键词索引部分
  87. other_index = other_key_line.index(",")
  88. other_key_indexs = other_key_line[other_index+1:]
  89. # 转换成真正的list对象
  90. other_key_indexs = ast.literal_eval(other_key_indexs)
  91. if not other_key_indexs:
  92. continue
  93. logging.debug("词根:%s, 涉及的其它关键词数量:%d" % (item, len(other_key_indexs)))
  94. for other_key_index in other_key_indexs:
  95. # bitmap校验,如果已经处理过则跳过
  96. if bm.test(int(other_key_index)):
  97. logging.debug("待比较关键词:%s 已处理,跳过" % other_key_index)
  98. continue
  99. # 从关键词索引中获取关键词位置
  100. pos = key_index_cache[other_key_index]
  101. # 获取待比较的关键词
  102. f_key_mmap.seek(pos)
  103. other_key_line = f_key_mmap.readline().decode(config.ENCODING_CHARSET)
  104. other_key_index, other_key,other_word_root = re_extract_key(pattern, other_key_line)
  105. # 计算相关性
  106. val = cal.cal_cos(key, other_key, word_root, other_word_root)
  107. if val >= 0.8:
  108. # 设置bitmap,该关键词已经处理过
  109. bm.set(other_key_index)
  110. # 记录类似的关键词
  111. agg_cache.append(other_key)
  112. # 保存到本地
  113. with open(config.AGG_ANALYSE_FILE % key, "w", encoding=config.ENCODING_CHARSET) as f:
  114. for item in agg_cache:
  115. f.write(item)
  116. f.write("\n")
  117. # 如果所有的关键词都处理完则结束
  118. if bm.all():
  119. logging.info("bitmap全部为1")
  120. break
  121. else:
  122. count = bm.count()
  123. logging.info("已处理数量:%d / %d,剩余数量:%d / %d" % (count, total_count, (total_count - count), total_count))
  124. tools.log_end_msg(TITLE)
  125. if __name__ == "__main__":
  126. main()