| 12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152 |
- # -*- coding:utf-8 -*-
- import logging
- import config
- import tools
- import jieba
- import datetime
- import mmap
- # Job title passed to tools.log_start_msg / tools.log_end_msg in main().
- TITLE = "关键词表 生成"
def main():
    """Build the keyword table from the merged keyword file.

    Reads ``config.MERGE_FILE`` line by line, segments every keyword with
    ``jieba.cut_for_search`` and writes one ``index,keyword,segments`` row
    per keyword to ``config.KEY_FILE``.  ``segments`` is the ``%s`` rendering
    of the Python list of tokens, as in the original output format.

    Lines that are empty once CR/LF are removed are skipped and do not
    consume an index; they no longer end processing early.  An empty input
    file yields an empty output file instead of an error.
    """
    # Initialise logging.
    tools.init_log()
    tools.log_start_msg(TITLE)

    # The input is read in binary mode so the number of bytes consumed is
    # known exactly for the progress display; each line is then decoded with
    # the configured charset (the same one the output file is written in).
    # Buffered line iteration replaces the former mmap, which cannot map an
    # empty file and offered no benefit for a sequential scan.
    with open(config.MERGE_FILE, "rb") as fmerge, \
            open(config.KEY_FILE, "w", encoding=config.ENCODING_CHARSET) as fw:
        # seek(0, 2) moves to the end of the file and returns its size in
        # bytes; rewind afterwards so iteration starts at the first line.
        total_num = fmerge.seek(0, 2)
        fmerge.seek(0)

        count = 0      # index of the next keyword row to write
        consumed = 0   # bytes of input read so far
        for raw_line in fmerge:
            consumed += len(raw_line)

            # Read the keyword, dropping any CR/LF characters.
            word = (
                raw_line.decode(config.ENCODING_CHARSET)
                .replace("\r", "")
                .replace("\n", "")
            )

            if word:
                # Segment the keyword and write the row.
                word_root = list(jieba.cut_for_search(word))
                fw.write("%d,%s,%s\n" % (count, word, word_root))
                count += 1

            # Progress is reported for every line read (blank ones too) so
            # it reaches the total even when the file ends in blank lines.
            tools.tip(total_num, consumed, False)

    tools.log_end_msg(TITLE)
- # Run the keyword-table generation only when this file is executed as a script.
- if __name__ == '__main__':
- main()
|