| 12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091 |
- # -*- coding:utf-8 -*-
- import sys
- from time import time
- import os
- import config
- import tools
- import ast
- import re
- import stop_word
- TITLE = "关键词倒排文件"
def main():
    """Build the inverted index of keyword roots and save it to disk.

    Reads ``config.KEY_FILE`` line by line. Each record must match
    ``<digits>,<field>,<roots>``, where ``<roots>`` is a Python literal
    (parsed with ``ast.literal_eval``) that is iterated to obtain the
    record's root words. Every root that is not in the stop-word
    collection is mapped to the list of record indexes it occurs in; the
    indexes are kept as the strings captured from the file. The mapping
    is then written to ``config.KEY_REVERSE_FILE``, one ``root,indexes``
    line per root.

    Blank lines in the input are skipped.

    Raises:
        ValueError: if a non-blank line does not match the record format.
    """
    tools.init_log()
    tools.log_start_msg(TITLE)

    # Record format: index, one comma-free field, then the rest of the line.
    record_pattern = re.compile(r"(\d+),([^,]*),(.*)", re.I)

    # Inverted index: root word -> list of record indexes containing it.
    key_reverse = {}

    # Stop words are excluded from the index.
    stop_word_cache = stop_word.load_stop_word()

    with open(config.KEY_FILE, "r", encoding=config.ENCODING_CHARSET) as fkey:
        # Measure the file by seeking to its end, then rewind so reading
        # starts from the beginning.
        fkey.seek(0, os.SEEK_END)
        total_num = fkey.tell()
        fkey.seek(0)

        while True:
            # Report progress before each read; this also runs once at
            # end of file, so the last report is made at the final position.
            cur_pos = fkey.tell()
            tools.tip_in_size(total_num, cur_pos)

            line = fkey.readline()
            # readline() returns an empty string only at end of file.
            if not line:
                break

            # A blank line (e.g. a trailing newline) carries no record.
            if not line.strip():
                continue

            m = record_pattern.match(line)
            if m is None:
                # Fail loudly with the offending text instead of an opaque
                # AttributeError on ``None.group``.
                raise ValueError(
                    "malformed record in %s: %r" % (config.KEY_FILE, line)
                )

            index = m.group(1)
            key_root = m.group(3)

            # The roots field is stored as a Python literal; turn it back
            # into a real object and index each element.
            for item in ast.literal_eval(key_root):
                if item in stop_word_cache:
                    continue
                key_reverse.setdefault(item, []).append(index)

    # Persist the index, one "root,indexes" line per entry.
    with open(
        config.KEY_REVERSE_FILE, "w", encoding=config.ENCODING_CHARSET
    ) as f:
        for key, value in key_reverse.items():
            f.write("%s,%s\n" % (key, value))

    tools.log_end_msg(TITLE)
if __name__ == "__main__":
    # Script entry point: build the inverted index only when this file is
    # executed directly, not when it is imported.
    main()
- # 测试加载耗时
- # start = time()
- # key_reverse_cache = tools.load_obj(config.KEY_REVERSE_CACHE)
- # end = time()
- # print("占用大小:", sys.getsizeof(key_reverse_cache))
- # print("加载耗时:", end-start)
|