# -*- coding:utf-8 -*-
"""Build, cache and load the stop-word lookup table.

Run as a script, this module also times repeated membership tests against
the loaded table.
"""
from datetime import datetime
import logging
import os
import pickle
import time

import config
import tools

TITLE = "停用词"


def load_stop_word():
    """Return the stop-word lookup table, building and caching it if needed.

    If ``config.STOP_WORD_CACHE`` exists, the object stored there is loaded
    with ``tools.load_obj`` and returned as-is. Otherwise every regular file
    in ``config.STOP_WORD_DIR`` is read (one stop word per line, decoded with
    ``config.ENCODING_CHARSET``), the result is saved to the cache path with
    ``tools.save_obj`` and returned.

    Returns:
        dict: maps each stop word to ``None``; only the keys matter, the
        dict is used for fast ``in`` membership tests.
    """
    # Reuse the on-disk cache when it is present.
    # NOTE(review): tools.load_obj presumably unpickles the file -- the cache
    # path must therefore be trusted; confirm against tools.
    if os.path.exists(config.STOP_WORD_CACHE):
        logging.debug("存在停用词缓存")
        return tools.load_obj(config.STOP_WORD_CACHE)

    logging.debug("正在构建停用词缓存")

    # A dict (rather than a list) gives constant-time lookups, and inserting
    # straight into it de-duplicates words that appear in several files.
    stop_word_dict = {}
    for file_name in os.listdir(config.STOP_WORD_DIR):
        stop_word_file = os.path.join(config.STOP_WORD_DIR, file_name)
        # os.listdir also yields sub-directories; open() would fail on them.
        if not os.path.isfile(stop_word_file):
            continue
        with open(stop_word_file, encoding=config.ENCODING_CHARSET) as f:
            # Text mode normalises line endings to "\n", so stripping the
            # trailing "\n" is all that is needed to get the bare word.
            stop_word_dict.update(
                dict.fromkeys(line.rstrip("\n") for line in f)
            )

    logging.debug("把停用词缓存保存到本地")
    # Persist the table so later calls can skip the rebuild.
    tools.save_obj(config.STOP_WORD_CACHE, stop_word_dict)
    return stop_word_dict


if __name__ == '__main__':
    tools.init_log()
    tools.log_start_msg(TITLE)
    stop_word = load_stop_word()

    # Micro-benchmark: time 14 million rounds of three membership tests.
    probes = ["总之", "风雨无阻", "千"]
    # perf_counter is monotonic and high-resolution, unlike wall-clock time().
    start = time.perf_counter()
    for _ in range(1400 * 10000):
        for item in probes:
            item in stop_word  # result discarded on purpose; only timing matters
    end = time.perf_counter()
    print("耗时:", end - start)

    tools.log_end_msg(TITLE)