tools.py 4.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195
  1. # -*- coding:utf-8 -*-
  2. import math
  3. import logging
  4. import os
  5. import config
  6. import logging.config
  7. import pickle
  8. import mmap
  9. TITLE = "工具类"
  10. tip_internal_cache = {}
  11. def init_log():
  12. """
  13. 日志初始化工具
  14. """
  15. # 读取日志配置文件内容
  16. logging.config.fileConfig('./logging.conf')
  17. # 用一个没有在配置文件中定义的logger名称来创建一个日志器logger
  18. return logging.getLogger()
  19. def log_start_msg(msg):
  20. """
  21. 执行开始时的简易日志输出
  22. """
  23. logging.info("-----------------%s 开始-----------------" % msg)
  24. def log_end_msg(msg):
  25. """
  26. 执行结束时的简易日志输出
  27. """
  28. logging.info("-----------------%s 结束-----------------" % msg)
  29. def get_tip_internal(total_num):
  30. """
  31. 计算进度提示间隔
  32. """
  33. # 尝试从缓存中获取
  34. internal = tip_internal_cache.get(total_num)
  35. # 不存在则进行计算并放入缓存中
  36. if not internal:
  37. internal = math.ceil(total_num * config.PRECENT_TIPS)
  38. tip_internal_cache[total_num] = internal
  39. return internal
  40. def tip(total_num, cur_num, is_zero_base=True):
  41. """
  42. 简易进度提示
  43. total_num 总数量
  44. cur_num 当前进度(0基)
  45. internal 提示间隔
  46. """
  47. # TODO
  48. # 修改成百分比提示
  49. internal = get_tip_internal(total_num)
  50. # cur_num + 1 是0基修正
  51. if is_zero_base:
  52. cur_num = cur_num + 1
  53. # 进度提示
  54. if cur_num == total_num:
  55. logging.info("当前进度 %d / %d" % (total_num, total_num))
  56. elif cur_num % internal == 0:
  57. logging.info("当前进度 %d / %d" % (cur_num, total_num))
  58. def tip_in_size(total_size, cur_pos):
  59. """
  60. 简易进度提示(用于不知道总行数的情形)
  61. total_size 总数量
  62. cur_num 当前进度
  63. """
  64. # 尝试从缓存中获取
  65. tip_internal = tip_internal_cache.get(total_size)
  66. if not tip_internal:
  67. # 不存在缓存,构建 提示检查点 和 提示间隔 信息
  68. internal = math.ceil(total_size * config.PRECENT_TIPS)
  69. tip_internal= {
  70. "check_point": cur_pos,
  71. "internal": internal
  72. }
  73. # 放入缓存
  74. tip_internal_cache[total_size] = tip_internal
  75. # 当前位置超过提示检查点则显示进度
  76. if cur_pos >= tip_internal["check_point"]:
  77. logging.info("当前进度 %d / %d" % (cur_pos, total_size))
  78. # 修改 提示检查点
  79. check_point = tip_internal["check_point"]
  80. internal = tip_internal["internal"]
  81. while cur_pos >= check_point:
  82. check_point = check_point + internal
  83. # 如果 提示检查点大于总值,则置为总值
  84. if check_point > total_size:
  85. check_point = total_size
  86. # 如果不手动中断会陷入循环
  87. break
  88. # 更新 提示检查点
  89. tip_internal["check_point"] = check_point
  90. def save_obj(path, obj):
  91. """
  92. 保存对象至本地
  93. """
  94. with open(path, "wb") as f:
  95. pickle.dump(obj, f)
  96. def load_obj(path):
  97. """
  98. 加载对象
  99. """
  100. with open(path, "rb") as f:
  101. return pickle.load(f)
  102. def load_stop_word():
  103. """
  104. 加载停用词
  105. """
  106. # 判断是否存在缓存
  107. if os.path.exists(config.STOP_WORD_CACHE):
  108. logging.debug("存在停用词缓存")
  109. return load_obj(config.STOP_WORD_CACHE)
  110. logging.debug("正在构建停用词缓存")
  111. # 停用词容器
  112. stop_word = set()
  113. # 构建停用词列表
  114. stop_word_files = os.listdir(config.STOP_WORD_DIR)
  115. for file in stop_word_files:
  116. stop_word_file = os.path.join(config.STOP_WORD_DIR, file)
  117. with open(stop_word_file, encoding=config.ENCODING_CHARSET) as f:
  118. for item in f:
  119. # 移除换行符
  120. stop_word.add(item.replace("\n","").replace("\r", ""))
  121. # 改成dict提升检索速度
  122. stop_word_dict = {}
  123. for item in stop_word:
  124. stop_word_dict[item]=None
  125. logging.debug("把停用词缓存保存到本地")
  126. # 保存本地作为缓存
  127. save_obj(config.STOP_WORD_CACHE, stop_word_dict)
  128. return stop_word_dict
  129. def avg_split_task(total:int, split_internal:int):
  130. """
  131. 平分任务
  132. """
  133. # 分割的任务份数
  134. split_num = math.ceil(total / split_internal)
  135. # 平分
  136. tasks = []
  137. for i in range(split_num):
  138. # 计算平分点在列表中的位置
  139. start_pos = i * split_internal
  140. end_pos = i * split_internal + split_internal
  141. # 如果超过列表大小需要额外处理
  142. if end_pos >= total:
  143. end_pos = -1
  144. tasks.append([start_pos,end_pos])
  145. return tasks
  146. if __name__ == "__main__":
  147. stop_word = load_stop_word()
  148. with open("./data/stopword.txt","w",encoding="UTF-8") as f:
  149. for stopWord in stop_word.keys():
  150. f.write("%s\n" % stopWord)