# -*- coding:utf-8 -*-
# key_reverse_index.py
import mmap

import config
import tools
  5. TITLE = "关键词倒排索引"
  6. def main():
  7. # 日志配置初始化
  8. tools.init_log()
  9. tools.log_start_msg(TITLE)
  10. # 关键词倒排索引容器
  11. reverse_index = []
  12. with open(config.KEY_REVERSE_FILE, "r", encoding=config.ENCODING_CHARSET) as freverse, \
  13. mmap.mmap(freverse.fileno(), 0, access=mmap.ACCESS_READ) as fmmap:
  14. # 总大小
  15. total_num = fmmap.size()
  16. while True:
  17. # 读取光标位置
  18. cur_pos = fmmap.tell()
  19. # 把光标移动到下一行
  20. line = fmmap.readline().decode(config.ENCODING_CHARSET)
  21. # 如果没有数据则结束
  22. if not line :
  23. break
  24. # 获取词根位置,建立词根和位置的关系
  25. index = line.index(",")
  26. key = line[:index]
  27. next_pos = fmmap.tell()
  28. reverse_index.append((key, cur_pos, next_pos))
  29. # 进度显示
  30. tools.tip_in_size(total_num, cur_pos)
  31. # 保存索引
  32. with open("./data/tmp/reverse_index_test.csv", "w", encoding=config.ENCODING_CHARSET) as f:
  33. for key, cur_pos, next_pos in reverse_index:
  34. f.write("%s,%d,%d\n" % (key, cur_pos, next_pos))
  35. # tools.save_obj(config.KEY_REVERSE_INDEX_CACHE, key_reverse_index_cache)
  36. tools.log_end_msg(TITLE)
  37. if __name__ == "__main__":
  38. main()