# statistics.py (4.6 KB)
  1. # -*- coding:utf-8 -*-
  2. from collections import namedtuple
  3. from dataclasses import make_dataclass
  4. import os
  5. import tools
  6. import config
  7. import logging
  8. import random
  9. import time
  10. import ast
  11. import mmap
  12. TASK_TITLE = "数据统计分析"
  13. def transfer_str(num):
  14. msg = None
  15. if num >= 10000:
  16. msg = "%d万%d" % (num//10000, num%10000)
  17. else:
  18. msg = str(num)
  19. return msg
  20. def cal(list):
  21. list_len = len(list)
  22. list_count = sum(list)
  23. sum_msg = transfer_str(list_len)
  24. count_msg = transfer_str(list_count)
  25. avg_msg = transfer_str(int(list_count/list_len))
  26. return sum_msg, count_msg, avg_msg
  27. def tip(condition, list):
  28. logging.info("条件:%s - 涉及:%s个词根,涉及词数:%s,平均约:%s 词数/词根" % ((condition,)+ cal(list)))
  29. def test_tip(list, ele_num):
  30. start =time.time()
  31. tmp = ast.literal_eval(str(random.sample(list, ele_num)))
  32. end =time.time()
  33. logging.info("%s个元素的字符列表转换成对象耗时%s" % (transfer_str(ele_num), end-start))
  34. def cost_statistics():
  35. with open(config.KEY_REVERSE_STATISTICS_FILE, "r", encoding=config.ENCODING_CHARSET) as f:
  36. count_list= []
  37. total_count=0
  38. for line in f:
  39. first_index = line.index(",")
  40. count = int(line[first_index+1:])
  41. count_list.append(count)
  42. total_count = total_count + count
  43. logging.info("总祠根数:%d, 涉及的总分词查找数:%d" % (len(count_list), total_count))
  44. tip("等于1", [val for val in count_list if val == 1])
  45. tip("大于1小于100", [val for val in count_list if val > 1 and val < 100])
  46. tip("大于等于100小于200", [val for val in count_list if val >= 100 and val < 200])
  47. tip("大于等于200小于300", [val for val in count_list if val >= 200 and val < 300])
  48. tip("大于等于300小于400", [val for val in count_list if val >= 300 and val < 400])
  49. tip("大于等于400小于500", [val for val in count_list if val >= 400 and val < 500])
  50. tip("大于等于500小于1000", [val for val in count_list if val >= 500 and val < 1000])
  51. tip("大于等于1000小于5000", [val for val in count_list if val >= 1000 and val < 5000])
  52. tip("大于等于5000小于1万", [val for val in count_list if val >= 5000 and val < 10000])
  53. tip("大于等于1万小于5万", [val for val in count_list if val >= 10000 and val < 50000])
  54. tip("大于等于5万小于10万", [val for val in count_list if val >= 50000 and val < 100000])
  55. tip("大于等于10万", [val for val in count_list if val >= 100000])
  56. sample_list = [i for i in range(14500029)]
  57. test_tip(sample_list, 1)
  58. test_tip(sample_list, 10)
  59. test_tip(sample_list, 50)
  60. test_tip(sample_list, 100)
  61. test_tip(sample_list, 200)
  62. test_tip(sample_list, 300)
  63. test_tip(sample_list, 400)
  64. test_tip(sample_list, 500)
  65. test_tip(sample_list, 1000)
  66. test_tip(sample_list, 5000)
  67. test_tip(sample_list, 10000)
  68. test_tip(sample_list, 50000)
  69. test_tip(sample_list, 100000)
  70. test_tip(sample_list, 595528)
  71. test_tip(sample_list, 689520)
  72. test_tip(sample_list, 776035)
  73. test_tip(sample_list, 822266)
  74. test_tip(sample_list, 951491)
  75. def memory_statistics():
  76. key_reverse_index_cache = tools.load_obj(config.KEY_REVERSE_INDEX_CACHE)
  77. end_pos = key_reverse_index_cache["导不出"]
  78. logging.info("查找结束位置")
  79. with open(config.KEY_REVERSE_FILE, "r", encoding=config.ENCODING_CHARSET) as freverse, \
  80. mmap.mmap(freverse.fileno(), 0, access=mmap.ACCESS_READ) as fmmap:
  81. logging.info("开始构建缓存")
  82. cache = {}
  83. start = time.time()
  84. while True:
  85. cur_pos = fmmap.tell()
  86. if cur_pos > end_pos:
  87. break
  88. line = fmmap.readline().decode("UTF-8")
  89. first_index = line.index(",")
  90. key = line[:first_index]
  91. # 转换
  92. word_root = line[first_index+1:]
  93. cache[key]=ast.literal_eval(word_root)
  94. end = time.time()
  95. logging.info('构建热点缓存完成,耗时:%s,缓存数量:%d' % ((end-start), len(cache)))
  96. logging.info('把缓存保存到本地')
  97. tools.save_obj(config.KEY_REVERSE_INDEX_HOT_CACHE, cache)
  98. logging.info('保存结束')
  99. time.sleep(20)
  100. logging.info('留20s进行内存观察')
  101. def main():
  102. tools.init_log()
  103. tools.log_start_msg(TASK_TITLE)
  104. memory_statistics()
  105. tools.log_end_msg(TASK_TITLE)
  106. if __name__ == "__main__":
  107. main()