| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155 |
- # -*- coding:utf-8 -*-
- import mmap
- import config
- import tools
- import stop_word
- import re
- import ast
- import cal
- import logging
- import ast
- from bitmap import BitMap
# Task title; main() passes it to tools.log_start_msg and tools.log_end_msg.
TITLE = "聚合文件"
def re_extract_key(pattern, line):
    """
    Extract the fields of one keyword record with a regular expression.

    Args:
        pattern: Compiled regular expression exposing at least three groups:
            (1) the keyword's numeric index, (2) the keyword itself and
            (3) the text of the keyword's segmented word roots.
        line: One text line holding a keyword record.

    Returns:
        A tuple ``(index, key, word_root)``. ``index`` is converted to ``int``
        so it is convenient to use; ``key`` and ``word_root`` are returned as
        the matched strings, unchanged.

    Raises:
        ValueError: If ``line`` does not match ``pattern``, or if group 1 is
            not a valid integer literal.
    """
    m = pattern.match(line)
    if m is None:
        # Fail with a message naming the offending line instead of an opaque
        # AttributeError on ``None.group``.
        raise ValueError("line does not match the keyword record pattern: %r" % (line,))
    index, key, word_root = m.group(1, 2, 3)
    return int(index), key, word_root
def _read_line_at(mapped_file, pos):
    """Seek *mapped_file* to byte offset *pos* and return the decoded line there."""
    mapped_file.seek(pos)
    return mapped_file.readline().decode(config.ENCODING_CHARSET)


def _related_key_indexes(f_reverse_mmap, key_reverse_index_cache, root):
    """Return the keyword indexes stored for *root* in the inverted-index file.

    An empty list is returned (and a warning logged) when *root* has no offset
    in ``key_reverse_index_cache``; otherwise the value is whatever literal
    follows the first comma of the line found at the cached offset.
    """
    pos = key_reverse_index_cache.get(root)
    if pos is None:
        # Without this guard ``seek(None)`` would raise TypeError.
        logging.warning("root %r is missing from the inverted index, skipped", root)
        return []
    reverse_line = _read_line_at(f_reverse_mmap, pos)
    # Keep only the part after the first comma and parse it as a literal.
    comma_at = reverse_line.index(",")
    return ast.literal_eval(reverse_line[comma_at + 1:])


def main():
    """
    Aggregate similar keywords and write one result file per main keyword.

    The keyword file is scanned line by line. For every keyword not yet marked
    in the bitmap, the keywords sharing one of its non-stop-word roots are
    looked up through the inverted index, compared with ``cal.cal_cos`` and,
    when the score is at least 0.8, marked as processed and collected. The
    main keyword followed by the collected keywords is written, one per line,
    to ``config.AGG_ANALYSE_FILE % key``.

    The scan stops at the end of the keyword file or as soon as every bit of
    the bitmap is set.
    """
    # Initialise logging.
    tools.init_log()
    tools.log_start_msg(TITLE)

    # Stop words.
    logging.info("加载停用词")
    stop_word_cache = stop_word.load_stop_word()

    # Keyword index: used below to map a keyword index to an offset in the keyword file.
    logging.info("加载关键词索引")
    key_index_cache = tools.load_obj(config.KEY_INDEX_CACHE)

    # Inverted index: used below to map a word root to an offset in the reverse file.
    logging.info("加载倒排索引")
    key_reverse_index_cache = tools.load_obj(config.KEY_REVERSE_INDEX_CACHE)

    # Regex used to split a keyword record into index, keyword and roots.
    pattern = re.compile(r"(\d+),([^,]*),(.*)", re.I)

    with open(config.KEY_FILE, "r", encoding=config.ENCODING_CHARSET) as fkey, \
            open(config.KEY_REVERSE_FILE, "r", encoding=config.ENCODING_CHARSET) as freverse, \
            mmap.mmap(fkey.fileno(), 0, access=mmap.ACCESS_READ) as f_key_mmap, \
            mmap.mmap(freverse.fileno(), 0, access=mmap.ACCESS_READ) as f_reverse_mmap:
        # Total number of keywords.
        # TODO: read this from the statistics instead of hard-coding it.
        total_count = 14500029

        # One bit per keyword index; a set bit means "already processed".
        bm = BitMap(total_count)

        # Total size in bytes of the file being processed (for progress tips).
        total_num = f_key_mmap.size()

        # Offset of the next unread line of the sequential scan. It is tracked
        # explicitly because the random-access lookups below move the shared
        # mmap cursor; without restoring it the scan would resume after the
        # last compared keyword and skip records.
        next_pos = 0
        while True:
            f_key_mmap.seek(next_pos)
            # Progress tip.
            tools.tip_in_size(total_num, next_pos)

            # Fetch the keyword record to process.
            line = f_key_mmap.readline().decode(config.ENCODING_CHARSET)
            next_pos = f_key_mmap.tell()

            # Nothing left to read: stop.
            if not line:
                logging.info("发现空白line")
                break

            index, key, word_root = re_extract_key(pattern, line)

            # Bitmap check: skip keywords that were already processed.
            if bm.test(index):
                logging.debug("主关键词:%s 已处理,跳过", key)
                continue
            # Mark this keyword as processed (sets its bit to 1).
            bm.set(index)

            # Aggregation result; the main keyword always comes first.
            agg_cache = [key]

            # Parse the root text into a real list once, so that the log
            # reports the number of roots rather than the string length.
            word_roots = ast.literal_eval(word_root)
            logging.debug("当前处理的主关键词:%s, 词根数量:%d", key, len(word_roots))

            for item in word_roots:
                # Ignore stop words.
                if item in stop_word_cache:
                    continue

                # Indexes of the other keywords that contain this root.
                other_key_indexs = _related_key_indexes(
                    f_reverse_mmap, key_reverse_index_cache, item)
                if not other_key_indexs:
                    continue
                logging.debug("词根:%s, 涉及的其它关键词数量:%d", item, len(other_key_indexs))

                for other_key_index in other_key_indexs:
                    # Bitmap check: skip keywords that were already processed.
                    if bm.test(int(other_key_index)):
                        logging.debug("待比较关键词:%s 已处理,跳过", other_key_index)
                        continue

                    # Locate and read the candidate keyword record.
                    other_key_line = _read_line_at(
                        f_key_mmap, key_index_cache[other_key_index])
                    other_index, other_key, other_word_root = re_extract_key(
                        pattern, other_key_line)

                    # Similarity score; the raw root strings are passed on as
                    # before, since cal.cal_cos's expected input is not
                    # visible from here.
                    val = cal.cal_cos(key, other_key, word_root, other_word_root)
                    if val >= 0.8:
                        # Mark the candidate as processed and record it.
                        bm.set(other_index)
                        agg_cache.append(other_key)

            # Save the aggregation locally.
            # NOTE(review): the keyword is used verbatim in the output path;
            # confirm keywords cannot contain path separators.
            with open(config.AGG_ANALYSE_FILE % key, "w", encoding=config.ENCODING_CHARSET) as f:
                f.writelines(item + "\n" for item in agg_cache)

            # Stop as soon as every keyword has been processed.
            if bm.all():
                logging.info("bitmap全部为1")
                break

            count = bm.count()
            logging.info("已处理数量:%d / %d,剩余数量:%d / %d",
                         count, total_count, total_count - count, total_count)

    tools.log_end_msg(TITLE)
# Run the aggregation only when this file is executed as a script.
if __name__ == "__main__":
    main()
|