utils.py

# -*- coding: utf-8 -*-
import json
import math
import os
import pickle

from src import constant

# Directory where the stop-word files are stored
STOP_WORD_DIR = os.path.join(constant.GLOBAL_PROJECT_RESOURCE_DIR, "stopwords")
# File name of the cached stop-word dictionary
STOP_WORD_CACHE = "stop_word.pkl"
# Special symbols that need extra escaping when building regular expressions
RE_SPECIAL_SYMBOL = [".", "?", "^", "$", "*", "+", "\\", "[", "]", "|", "{", "}", "(", ")"]
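
# Illustrative sketch (not part of the original module): one way RE_SPECIAL_SYMBOL
# could be used is to escape a literal keyword before embedding it in a pattern;
# the standard library's re.escape() covers the same need. The helper name below
# is hypothetical.
def _escape_for_re(text: str) -> str:
    # Escape the backslash first so the escapes added below are not escaped again
    text = text.replace("\\", "\\\\")
    for symbol in RE_SPECIAL_SYMBOL:
        if symbol != "\\":
            text = text.replace(symbol, "\\" + symbol)
    return text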

def save_obj(path, obj):
    """
    Save an object to a local file with pickle.
    """
    with open(path, "wb") as f:
        pickle.dump(obj, f)

def load_obj(path):
    """
    Load an object previously saved with save_obj.
    """
    with open(path, "rb") as f:
        return pickle.load(f)

def load_stop_word():
    """
    Load the stop words, building and caching the dictionary on first use.
    """
    # Create the temp directory if it does not exist yet
    if not os.path.exists(constant.GLOBAL_PROJECT_TEMP_DIR):
        os.makedirs(constant.GLOBAL_PROJECT_TEMP_DIR)
    # Return the cached dictionary if one has already been built
    stop_word_cache_path = os.path.join(constant.GLOBAL_PROJECT_TEMP_DIR, STOP_WORD_CACHE)
    if os.path.exists(stop_word_cache_path) and os.path.isfile(stop_word_cache_path):
        return load_obj(stop_word_cache_path)
    # Container for the stop words
    stop_word = set()
    # Build the stop-word set from every file in the stop-word directory
    stop_word_files = os.listdir(STOP_WORD_DIR)
    for file in stop_word_files:
        stop_word_file = os.path.join(STOP_WORD_DIR, file)
        with open(stop_word_file, encoding="UTF-8") as f:
            for item in f:
                # Strip line breaks
                stop_word.add(item.replace("\n", "").replace("\r", ""))
    # Convert the set to a dict to speed up lookups
    stop_word_dict = {}
    for item in stop_word:
        stop_word_dict[item] = None
    # Save the dictionary locally as a cache
    save_obj(stop_word_cache_path, stop_word_dict)
    return stop_word_dict
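
# Illustrative sketch (not part of the original module): the dict returned by
# load_stop_word() gives constant-time membership tests, so filtering a token
# list is a plain comprehension. The helper name below is hypothetical.
def _filter_stop_words(tokens):
    stop_words = load_stop_word()
    return [token for token in tokens if token not in stop_words]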

def avg_split_task(total: int, split_internal: int, start=0):
    """
    Split a task into evenly sized chunks. Each chunk includes its start position
    and excludes its end position; positions are counted from 0.
    :param total: total number of items
    :param split_internal: number of items per chunk
    :param start: start position of the first chunk
    :return: list of (start position, end position) pairs
    """
    # Number of chunks to split into
    split_num = math.ceil(total / split_internal)
    # Split evenly
    tasks = []
    for i in range(split_num):
        # Compute this chunk's boundary positions in the list
        start_pos = i * split_internal
        end_pos = i * split_internal + split_internal
        if i == 0:
            start_pos = start
        # If the end runs past the list size, use -1 as a "through the end" sentinel
        if end_pos >= total:
            end_pos = -1
        tasks.append([start_pos, end_pos])
    return tasks
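
# Worked example (values derived from the logic above, shown for illustration):
#   avg_split_task(10, 4) -> [[0, 4], [4, 8], [8, -1]]
# The last chunk's end position is the -1 sentinel meaning "through the end of
# the list"; callers are assumed to handle it when slicing.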

def remove_line_break(line: str):
    """
    Remove line breaks from a piece of text.
    :param line: text to process
    :return: the text with line breaks removed
    """
    if line:
        return line.replace("\r", "").replace("\n", "")
    return line

def saveJson(save_path: str, save_obj: dict):
    """
    Save an object as a JSON file.
    :param save_path: path to write to
    :param save_obj: content object to save
    :return:
    """
    # Create the project temp directory if it does not exist yet
    if not os.path.exists(constant.GLOBAL_PROJECT_TEMP_DIR):
        os.makedirs(constant.GLOBAL_PROJECT_TEMP_DIR)
    with open(save_path, 'w', encoding='utf-8') as f:
        f.write(json.dumps(save_obj))

def load_json(path: str):
    """
    Load a JSON file.
    :param path: path to read from
    :return: the parsed object, or an empty dict if the file does not exist
    """
    if os.path.exists(path) and os.path.isfile(path):
        with open(path, 'r', encoding='utf-8') as f:
            return json.loads(f.read())
    return dict()
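
# Minimal round-trip demo (not part of the original module); it assumes
# constant.GLOBAL_PROJECT_TEMP_DIR is writable and only runs when this file is
# executed directly. The file name "demo.json" is hypothetical.
if __name__ == "__main__":
    demo_path = os.path.join(constant.GLOBAL_PROJECT_TEMP_DIR, "demo.json")
    saveJson(demo_path, {"hello": "world"})
    print(load_json(demo_path))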