# -*- coding:utf-8 -*-
"""Generate the keyword table.

Reads keywords (one per line) from ``config.MERGE_FILE``, segments each one
with ``jieba.cut_for_search`` and writes ``id,keyword,segments`` rows to
``config.KEY_FILE``.
"""
import logging
import config
import tools
import jieba
import datetime
import mmap

TITLE = "关键词表 生成"


def main():
    """Build the keyword table file from the merged keyword file.

    Each non-blank input line yields one output row of the form
    ``<id>,<keyword>,<segments>``, where ``<id>`` is a 0-based running
    counter over the rows written and ``<segments>`` is the string form of
    the list returned by ``jieba.cut_for_search``.

    Blank lines are skipped rather than treated as end-of-input, and an
    empty input file simply produces an empty output file.
    """
    # Initialise logging configuration.
    tools.init_log()
    tools.log_start_msg(TITLE)

    # The input is read in binary mode so that progress can be tracked in
    # exact bytes; each line is decoded with the configured charset.
    with open(config.MERGE_FILE, "rb") as fmerge, \
            open(config.KEY_FILE, "w", encoding=config.ENCODING_CHARSET) as fw:
        # Total input size in bytes: the denominator for progress reporting.
        fmerge.seek(0, 2)  # whence=2 is os.SEEK_END
        total_num = fmerge.tell()
        fmerge.seek(0)

        count = 0       # id assigned to the next row written
        bytes_read = 0  # bytes consumed so far, including line terminators

        # Iterating the file object stops only at real EOF, so a blank line
        # in the middle of the input no longer ends processing early.
        for raw_line in fmerge:
            bytes_read += len(raw_line)

            # Decode the keyword and drop carriage returns / newlines.
            word = raw_line.decode(config.ENCODING_CHARSET)
            word = word.replace("\r", "").replace("\n", "")

            if word:
                # Segment the keyword.
                word_root = list(jieba.cut_for_search(word))
                # Write the row; the format is kept as-is for downstream readers.
                fw.write("%d,%s,%s\n" % (count, word, word_root))
                count += 1

            # Progress report (total bytes, bytes processed so far).
            tools.tip(total_num, bytes_read, False)

    tools.log_end_msg(TITLE)


if __name__ == '__main__':
    main()