| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135 |
- # -*- coding:utf-8 -*-
- import json
- import math
- import os
- import pickle
- from src import constant
# Directory that holds the stop-word list files
STOP_WORD_DIR = os.path.join(constant.GLOBAL_PROJECT_RESOURCE_DIR, "stopwords")

# File name of the pickled stop-word cache
STOP_WORD_CACHE = "stop_word.pkl"

# Characters that need extra handling (escaping) inside a regular expression;
# one list entry per character of the string below, in the same order
RE_SPECIAL_SYMBOL = list(".?^$*+\\[]|{}()")
def save_obj(path, obj):
    """
    Pickle an object to a local file.

    :param path: destination file path (overwritten if it already exists)
    :param obj: any picklable object
    :return: None
    """
    with open(path, "wb") as out_file:
        pickle.dump(obj, out_file)
def load_obj(path):
    """
    Read a pickled object back from a local file.

    :param path: path of a file containing pickled data
    :return: the unpickled object
    """
    with open(path, "rb") as in_file:
        restored = pickle.load(in_file)
    return restored
def load_stop_word():
    """
    Load the stop-word table, building and caching it on first use.

    If a pickled cache file exists under ``constant.GLOBAL_PROJECT_TEMP_DIR`` it is
    returned directly. Otherwise every regular file in ``STOP_WORD_DIR`` is read
    (one entry per line, line breaks removed), the result is written to the cache
    file and then returned.

    :return: dict whose keys are the stop words and whose values are all ``None``
    """
    # exist_ok avoids the check-then-create race when several processes start at once
    os.makedirs(constant.GLOBAL_PROJECT_TEMP_DIR, exist_ok=True)

    # Reuse the cache when it is already there
    stop_word_cache_path = os.path.join(constant.GLOBAL_PROJECT_TEMP_DIR, STOP_WORD_CACHE)
    if os.path.isfile(stop_word_cache_path):
        # NOTE: the cache is a pickle; it must only ever come from a trusted location
        return load_obj(stop_word_cache_path)

    # A dict is used (rather than a list) for fast membership checks;
    # duplicate words simply overwrite the same key
    stop_word_dict = {}
    for file_name in os.listdir(STOP_WORD_DIR):
        stop_word_file = os.path.join(STOP_WORD_DIR, file_name)
        # Skip sub-directories and other non-regular entries instead of crashing in open()
        if not os.path.isfile(stop_word_file):
            continue
        # utf-8-sig also reads plain UTF-8, but drops a leading BOM so it does not
        # end up glued to the first stop word of the file
        with open(stop_word_file, encoding="utf-8-sig") as f:
            for item in f:
                # Strip line-break characters
                stop_word_dict[item.replace("\n", "").replace("\r", "")] = None

    # Persist locally as the cache for later calls
    save_obj(stop_word_cache_path, stop_word_dict)
    return stop_word_dict
def avg_split_task(total: int, split_internal: int, start=0):
    """
    Split ``total`` items into consecutive chunks of ``split_internal`` items.

    Each chunk is a ``[start_pos, end_pos]`` pair; the start is inclusive and the
    end is exclusive, with positions counted from 0. A chunk whose computed end
    would reach or pass ``total`` gets ``-1`` as its end position instead.

    :param total: total number of items
    :param split_internal: number of items per chunk
    :param start: start position used for the first chunk only
    :return: list of ``[start_pos, end_pos]`` pairs
    """
    # Number of chunks needed to cover all items
    chunk_count = math.ceil(total / split_internal)

    ranges = []
    for index in range(chunk_count):
        # Only the first chunk honours the caller-supplied start position
        lower = start if index == 0 else index * split_internal
        upper = index * split_internal + split_internal
        # An end at or beyond the total is replaced by the -1 marker
        if upper >= total:
            upper = -1
        ranges.append([lower, upper])
    return ranges
def remove_line_break(line: str):
    """
    Strip every carriage-return and line-feed character from a text.

    :param line: text to process; falsy values (``None``, ``""``) are returned unchanged
    :return: the text without ``\\r`` and ``\\n`` characters
    """
    # Nothing to clean for empty / missing input
    if not line:
        return line
    without_cr = line.replace("\r", "")
    return without_cr.replace("\n", "")
def saveJson(save_path: str, save_obj: dict):
    """
    Serialize an object to a JSON file.

    The project temp directory and the parent directory of ``save_path`` are
    created when missing. The object is serialized before the file is opened, so a
    serialization error leaves any existing file untouched.

    :param save_path: path of the file to write (overwritten if it exists)
    :param save_obj: JSON-serializable content to store (this parameter shadows the
        module-level ``save_obj`` function inside this body)
    :return: None
    :raises TypeError: if ``save_obj`` is not JSON serializable
    """
    # Serialize first: opening with 'w' truncates, so doing it afterwards would wipe
    # an existing file whenever serialization fails
    payload = json.dumps(save_obj)

    # exist_ok avoids the check-then-create race between concurrent callers
    os.makedirs(constant.GLOBAL_PROJECT_TEMP_DIR, exist_ok=True)

    # save_path is not necessarily inside the temp directory, so make sure its own
    # parent directory exists as well
    parent_dir = os.path.dirname(save_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    with open(save_path, 'w', encoding='utf-8') as f:
        f.write(payload)
def load_json(path: str):
    """
    Read and parse a JSON file.

    :param path: path of the JSON file
    :return: the parsed content, or an empty dict when ``path`` is not an existing file
    """
    # isfile() is already False for missing paths, so one check covers both cases
    if not os.path.isfile(path):
        return dict()
    with open(path, 'r', encoding='utf-8') as json_file:
        return json.load(json_file)
|