key.py 1.3 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152
  1. # -*- coding:utf-8 -*-
  2. import logging
  3. import config
  4. import tools
  5. import jieba
  6. import datetime
  7. import mmap
  # Task title; main() passes it to tools.log_start_msg() and tools.log_end_msg().
  TITLE = "关键词表 生成"
  def main():
      """Build the keyword table from the merged keyword file.

      Reads ``config.MERGE_FILE`` line by line, segments every keyword with
      ``jieba.cut_for_search`` and writes one ``id,keyword,segments`` row per
      keyword to ``config.KEY_FILE``.  IDs start at 0 and increase by one for
      each keyword written; ``segments`` is the ``%s`` rendering of the token
      list, exactly as before.

      Blank lines are skipped rather than treated as end-of-input, so
      keywords that follow an empty line are no longer dropped.  An empty
      input file simply yields an empty output file (``mmap`` refuses to map
      zero-length files, so the input is now streamed instead of mapped).
      """
      # Initialise logging and announce the start of the task.
      tools.init_log()
      tools.log_start_msg(TITLE)

      # The input is opened in binary mode so that progress can be measured in
      # bytes; each line is decoded with the project charset, the same one the
      # output file is written with.
      with open(config.MERGE_FILE, "rb") as fmerge, \
              open(config.KEY_FILE, "w", encoding=config.ENCODING_CHARSET) as fw:
          # seek() returns the new absolute position, i.e. the file size here.
          total_num = fmerge.seek(0, 2)
          fmerge.seek(0)

          consumed = 0  # bytes read so far, reported to the progress helper
          count = 0     # ID assigned to the next keyword written

          # Binary iteration splits on b"\n" only, like mmap.readline() did.
          for raw in fmerge:
              consumed += len(raw)

              # Strip carriage returns / line feeds from the keyword.
              word = raw.decode(config.ENCODING_CHARSET)
              word = word.replace("\r", "").replace("\n", "")

              if word:
                  # Segment the keyword and append its row to the table.
                  word_root = list(jieba.cut_for_search(word))
                  fw.write("%d,%s,%s\n" % (count, word, word_root))
                  count += 1

              # Progress report: total bytes vs. bytes consumed so far.
              tools.tip(total_num, consumed, False)

      tools.log_end_msg(TITLE)
  # Script entry point: run the keyword-table job only when executed directly.
  if __name__ == "__main__":
      main()