| 1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465 |
- # -*- coding:utf-8 -*-
- from datetime import datetime
- import os
- import time
- import tools
- import config
- import pickle
- import logging
# Label handed to tools.log_start_msg / tools.log_end_msg when this module is run as a script.
TITLE = "停用词"
def load_stop_word():
    """
    Load the stop-word lookup table, building the on-disk cache if needed.

    When ``config.STOP_WORD_CACHE`` already exists, whatever object
    ``tools.load_obj`` reads from it is returned unchanged (presumably the
    dict saved by an earlier call -- verify against ``tools``).

    Otherwise every regular file directly inside ``config.STOP_WORD_DIR`` is
    read with the ``config.ENCODING_CHARSET`` encoding, one stop word per
    line. The words are collected into a dict, which is written to
    ``config.STOP_WORD_CACHE`` through ``tools.save_obj`` and then returned.

    Returns:
        dict: each distinct stop word mapped to ``None``. A dict is used
        (rather than a list) so that ``word in result`` is a hash lookup.
    """
    # Reuse the cache when one is present.
    if os.path.exists(config.STOP_WORD_CACHE):
        logging.debug("存在停用词缓存")
        return tools.load_obj(config.STOP_WORD_CACHE)

    logging.debug("正在构建停用词缓存")

    # Dict keys are unique, so filling the dict directly de-duplicates the
    # words without the intermediate list/set round trip.
    stop_word_dict = {}

    # Sorted so the key order of the saved cache does not depend on the
    # arbitrary order in which os.listdir reports entries.
    for file_name in sorted(os.listdir(config.STOP_WORD_DIR)):
        stop_word_file = os.path.join(config.STOP_WORD_DIR, file_name)
        # os.listdir also yields sub-directories; open() would fail on those.
        if not os.path.isfile(stop_word_file):
            continue
        with open(stop_word_file, encoding=config.ENCODING_CHARSET) as f:
            for line in f:
                # Drop only the line terminator; other whitespace is kept
                # because a stop word may itself be a whitespace character.
                stop_word_dict[line.rstrip("\n")] = None

    logging.debug("把停用词缓存保存到本地")
    # Persist locally so the next call can take the cache path above.
    tools.save_obj(config.STOP_WORD_CACHE, stop_word_dict)

    return stop_word_dict
if __name__ == '__main__':
    # Script entry point: load the stop words, then time repeated
    # membership tests against the returned container.
    tools.init_log()
    tools.log_start_msg(TITLE)

    stop_words = load_stop_word()

    probe_words = ["总之", "风雨无阻", "千"]
    rounds = 1400 * 10000

    began = time.time()
    for _ in range(rounds):
        for word in probe_words:
            # Result is intentionally discarded; only the lookup cost matters.
            word in stop_words
    finished = time.time()

    print("耗时:", finished - began)
    tools.log_end_msg(TITLE)
|