# -*- coding:utf-8 -*-
import math
import os
import pickle

# Directory where the stop-word files are stored
STOP_WORD_DIR = "./resources/stopwords"
# Temporary file path
TEMP_PATH = "../tmp"
# Stop-word cache file
STOP_WORD_CACHE = "stop_word.pkl"
# Special symbols that need extra escaping in regular expressions
RE_SPECIAL_SYMBOL = [".", "?", "^", "$", "*", "+", "\\", "[", "]", "|", "{", "}", "(", ")"]


def save_obj(path, obj):
    """
    Save an object to a local file
    """
    with open(path, "wb") as f:
        pickle.dump(obj, f)


def load_obj(path):
    """
    Load an object from a local file
    """
    with open(path, "rb") as f:
        return pickle.load(f)


def load_stop_word():
    """
    Load the stop words
    """
    # Create the temporary directory if it does not exist yet
    if not os.path.exists(TEMP_PATH):
        os.makedirs(TEMP_PATH)
    # Return the cached result if one exists
    stop_word_cache_path = os.path.join(TEMP_PATH, STOP_WORD_CACHE)
    if os.path.exists(stop_word_cache_path) and os.path.isfile(stop_word_cache_path):
        return load_obj(stop_word_cache_path)
    # Container for the stop words
    stop_word = set()
    # Build the stop-word list from every file in the directory
    stop_word_files = os.listdir(STOP_WORD_DIR)
    for file in stop_word_files:
        stop_word_file = os.path.join(STOP_WORD_DIR, file)
        with open(stop_word_file, encoding="UTF-8") as f:
            for item in f:
                # Strip line breaks
                stop_word.add(item.replace("\n", "").replace("\r", ""))
    # Convert to a dict to speed up lookups
    stop_word_dict = {}
    for item in stop_word:
        stop_word_dict[item] = None
    # Save locally as a cache
    save_obj(stop_word_cache_path, stop_word_dict)
    return stop_word_dict


def avg_split_task(total: int, split_internal: int, start=0):
    """
    Split tasks evenly into chunks; each chunk includes its start position
    and excludes its end position, with positions counted from 0
    :param start: start position
    :param total: total number of tasks
    :param split_internal: number of tasks per chunk
    :return: list of (start position, end position) pairs
    """
    # Number of chunks to split into
    split_num = math.ceil(total / split_internal)
    # Split evenly
    tasks = []
    for i in range(split_num):
        # Compute the chunk boundaries within the list
        start_pos = i * split_internal
        end_pos = i * split_internal + split_internal
        if i == 0:
            start_pos = start
        # Clamp the last chunk's exclusive end to the total size
        if end_pos >= total:
            end_pos = total
        tasks.append([start_pos, end_pos])
    return tasks


def remove_line_break(line: str):
    """
    Strip line breaks
    :param line: text to process
    :return: the text with line breaks removed
    """
    if line:
        return line.replace("\r", "").replace("\n", "")
    return line
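
# A minimal usage sketch for the helpers above, assuming only this module
# itself. The load_stop_word() call is left commented out because it needs
# the ./resources/stopwords directory to exist on disk.
if __name__ == "__main__":
    # Split 10 tasks into chunks of 3: [[0, 3], [3, 6], [6, 9], [9, 10]]
    print(avg_split_task(10, 3))
    # Strip Windows-style line endings: prints "hello"
    print(remove_line_break("hello\r\n"))
    # stop_words = load_stop_word()  # requires STOP_WORD_DIR to exist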