stop_word.py 1.5 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465
  1. # -*- coding:utf-8 -*-
  2. from datetime import datetime
  3. import os
  4. import time
  5. import tools
  6. import config
  7. import pickle
  8. import logging
  9. TITLE = "停用词"
  10. def load_stop_word():
  11. """
  12. 加载停用词
  13. """
  14. # 判断是否存在缓存
  15. if os.path.exists(config.STOP_WORD_CACHE):
  16. logging.debug("存在停用词缓存")
  17. return tools.load_obj(config.STOP_WORD_CACHE)
  18. logging.debug("正在构建停用词缓存")
  19. # 停用词容器
  20. stop_word = []
  21. # 构建停用词列表
  22. stop_word_files = os.listdir(config.STOP_WORD_DIR)
  23. for file in stop_word_files:
  24. stop_word_file = os.path.join(config.STOP_WORD_DIR, file)
  25. with open(stop_word_file, encoding=config.ENCODING_CHARSET) as f:
  26. for item in f:
  27. # 移除换行符
  28. stop_word.append(item.replace("\n",""))
  29. # 去重
  30. stop_word = list(set(stop_word))
  31. # 把list改成dict提升检索速度
  32. stop_word_dict = {}
  33. for item in stop_word:
  34. stop_word_dict[item]=None
  35. logging.debug("把停用词缓存保存到本地")
  36. # 保存本地作为缓存
  37. tools.save_obj(config.STOP_WORD_CACHE, stop_word_dict)
  38. return stop_word_dict
  39. if __name__ == '__main__':
  40. tools.init_log()
  41. tools.log_start_msg(TITLE)
  42. stop_word = load_stop_word()
  43. start = time.time()
  44. for i in range(1400*10000):
  45. for item in ["总之", "风雨无阻","千"]:
  46. item in stop_word
  47. end = time.time()
  48. print("耗时:", end - start)
  49. tools.log_end_msg(TITLE)