# zkpk / money-mining-python
							# -*- coding:utf-8 -*-

from datetime import datetime
import os
import time
import tools
import config
import pickle
import logging

# Label ("stop words") passed to tools.log_start_msg / tools.log_end_msg
# when this module is run as a script.
TITLE = "停用词"

def load_stop_word():
    """
    Load the stop-word table, building and caching it on first use.

    If ``config.STOP_WORD_CACHE`` exists, the cached object is returned
    through ``tools.load_obj`` (presumably the dict saved by an earlier run;
    verify against ``tools``). Otherwise every regular file directly inside
    ``config.STOP_WORD_DIR`` is read line by line using
    ``config.ENCODING_CHARSET``, the result is handed to ``tools.save_obj``
    to create the cache, and then returned.

    Returns:
        dict: each distinct line (trailing newline removed) mapped to
        ``None``. Only the keys matter; the dict is used for fast
        ``word in stop_words`` membership tests.

    Raises:
        OSError: if the stop-word directory or one of its files cannot be
            listed or opened.
        UnicodeDecodeError: if a file is not valid in the configured
            encoding.
    """

    # Fast path: reuse the on-disk cache when it is present.
    if os.path.exists(config.STOP_WORD_CACHE):
        logging.debug("存在停用词缓存")
        return tools.load_obj(config.STOP_WORD_CACHE)

    logging.debug("正在构建停用词缓存")

    # A dict keyed by word de-duplicates on insertion, so no intermediate
    # list/set round-trip is needed.
    stop_word_dict = {}

    # os.listdir returns entries in arbitrary order; sorting makes the build
    # (and therefore the saved cache) deterministic.
    for entry in sorted(os.listdir(config.STOP_WORD_DIR)):
        stop_word_file = os.path.join(config.STOP_WORD_DIR, entry)

        # listdir also yields sub-directories, which open() cannot read.
        if not os.path.isfile(stop_word_file):
            continue

        with open(stop_word_file, encoding=config.ENCODING_CHARSET) as f:
            for line in f:
                # Text mode normalises line endings to "\n", so a line holds
                # at most one newline, at its end.
                stop_word_dict[line.rstrip("\n")] = None

    logging.debug("把停用词缓存保存到本地")

    # Persist the table so later calls take the fast path above.
    tools.save_obj(config.STOP_WORD_CACHE, stop_word_dict)

    return stop_word_dict

if __name__ == '__main__':
    # Manual micro-benchmark: time repeated membership tests against the
    # stop-word table returned by load_stop_word().

    tools.init_log()
    tools.log_start_msg(TITLE)

    stop_word = load_stop_word()

    # Number of outer iterations; each one performs three lookups.
    rounds = 1400 * 10000

    # perf_counter is a monotonic, high-resolution clock meant for measuring
    # intervals; time.time() can jump when the system clock is adjusted.
    start = time.perf_counter()
    for _ in range(rounds):
        for item in ("总之", "风雨无阻", "千"):
            # Result is deliberately discarded; only the lookup cost matters.
            item in stop_word
    end = time.perf_counter()
    print("耗时：", end - start)

    tools.log_end_msg(TITLE)