| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051 |
- # -*- coding:utf-8 -*-
- import config
- import tools
- import mmap
# Task title handed to the start/end log helpers ("keyword inverted index").
TITLE = "关键词倒排索引"
def _scan_key_offsets(path, encoding):
    """Scan the keyword file and return one ``(key, start, end)`` tuple per line.

    ``key`` is the text before the first comma of the line. ``start`` is the
    byte offset at which the line begins and ``end`` is the byte offset just
    past its line terminator, so ``data[start:end]`` is the whole line.

    Blank lines are skipped. An empty file yields an empty list.

    Raises:
        ValueError: if a non-blank line contains no ``,`` separator.
    """
    import os  # local import: only needed for the empty-file guard below

    entries = []
    # Binary mode: only the file descriptor is used, and mmap works on bytes.
    with open(path, "rb") as freverse:
        # mmap cannot map a zero-length file, so treat it as "no keys".
        if os.fstat(freverse.fileno()).st_size == 0:
            return entries

        with mmap.mmap(freverse.fileno(), 0, access=mmap.ACCESS_READ) as fmmap:
            # Total size in bytes, used for progress reporting.
            total_num = fmmap.size()
            while True:
                # Offset where the upcoming line starts.
                cur_pos = fmmap.tell()
                # Reading the line moves the cursor to the start of the next one.
                raw_line = fmmap.readline()
                if not raw_line:
                    # No more data.
                    break
                next_pos = fmmap.tell()

                line = raw_line.decode(encoding)
                # A blank line (e.g. a trailing empty line) carries no key.
                if line.strip():
                    key, sep, _ = line.partition(",")
                    if not sep:
                        # Same exception type that str.index raised before,
                        # but now pointing at the offending location.
                        raise ValueError(
                            "no ',' separator in line at byte offset %d of %s"
                            % (cur_pos, path)
                        )
                    # Relate the key to the byte range of its line.
                    entries.append((key, cur_pos, next_pos))

                # Progress display.
                tools.tip_in_size(total_num, cur_pos)
    return entries


def main():
    """Build the keyword position index and write it out as CSV.

    Reads ``config.KEY_REVERSE_FILE``, records for every line the key (text
    before the first comma) together with the byte offsets where the line
    starts and ends, and writes ``key,start,end`` rows to
    ``./data/tmp/reverse_index_test.csv``.

    Raises:
        ValueError: if a non-blank input line contains no ``,`` separator.
    """
    # Initialise the log configuration and announce the task.
    tools.init_log()
    tools.log_start_msg(TITLE)

    # Keyword index: list of (key, start_offset, end_offset).
    reverse_index = _scan_key_offsets(
        config.KEY_REVERSE_FILE, config.ENCODING_CHARSET
    )

    # Save the index.
    with open("./data/tmp/reverse_index_test.csv", "w", encoding=config.ENCODING_CHARSET) as f:
        for key, cur_pos, next_pos in reverse_index:
            f.write("%s,%d,%d\n" % (key, cur_pos, next_pos))
    # NOTE(review): the original carried a disabled call to
    # tools.save_obj(config.KEY_REVERSE_INDEX_CACHE, ...) here; the index is
    # currently written only to the test CSV above.

    tools.log_end_msg(TITLE)
# Run the indexing job only when this file is executed as a script.
if __name__ == "__main__":
    main()
|