| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144 |
- # -*- coding:utf-8 -*-
- from collections import namedtuple
- from dataclasses import make_dataclass
- import os
- import tools
- import config
- import logging
- import random
- import time
- import ast
- import mmap
# Task title handed to tools.log_start_msg() and tools.log_end_msg() in main().
- TASK_TITLE = "数据统计分析"
def transfer_str(num):
    """Format an integer count for log output.

    Values below 10000 are rendered as plain decimal digits. Larger values
    are split at the 万 (ten-thousand) unit, e.g. 123456 -> "12万3456".

    The part after 万 is zero-padded to four digits so each digit keeps its
    place value: 10005 becomes "1万0005" instead of "1万5", which would read
    as fifteen thousand.

    Args:
        num: Integer to format.

    Returns:
        The formatted string.
    """
    if num < 10000:
        return str(num)
    ten_thousands, remainder = divmod(num, 10000)
    return "%d万%04d" % (ten_thousands, remainder)
def cal(list):
    """Summarise a collection of counts as three display strings.

    Args:
        list: Sequence of integer counts. The name shadows the builtin; it is
            kept unchanged so existing callers are not affected.

    Returns:
        A tuple ``(size, total, average)`` of strings produced by
        ``transfer_str``: the number of entries, their sum, and the sum
        divided by the number of entries, truncated to an integer. For an
        empty input the average is reported as 0.
    """
    entry_count = len(list)
    total = sum(list)
    # cost_statistics() passes range-filtered buckets, and a bucket with no
    # matches is a normal outcome, so guard the division instead of letting
    # it raise ZeroDivisionError.
    average = int(total / entry_count) if entry_count else 0
    return transfer_str(entry_count), transfer_str(total), transfer_str(average)
def tip(condition, list):
    """Log the summary statistics of one bucket of counts.

    Args:
        condition: Text describing the filter that produced the bucket.
        list: The counts that satisfied the filter; summarised via ``cal``.
    """
    size_msg, total_msg, avg_msg = cal(list)
    message = "条件:%s - 涉及:%s个词根,涉及词数:%s,平均约:%s 词数/词根" % (
        condition,
        size_msg,
        total_msg,
        avg_msg,
    )
    logging.info(message)
def test_tip(list, ele_num):
    """Log how long ``ast.literal_eval`` takes on a list literal of ``ele_num`` items.

    A sample of ``ele_num`` elements is drawn from ``list`` with
    ``random.sample``, rendered with ``str()``, and the resulting text is
    parsed back with ``ast.literal_eval``. Only the parsing step is timed.

    Args:
        list: Population to sample from. ``random.sample`` raises ValueError
            when it holds fewer than ``ele_num`` items.
        ele_num: Number of elements in the sampled list.
    """
    # Build the input text before starting the clock so the measurement covers
    # only the string-to-object conversion that the log message reports.
    literal_text = str(random.sample(list, ele_num))
    # perf_counter is the clock intended for measuring short intervals; it is
    # not affected by system clock adjustments the way time.time() is.
    start = time.perf_counter()
    ast.literal_eval(literal_text)
    elapsed = time.perf_counter() - start
    logging.info("%s个元素的字符列表转换成对象耗时%s" % (transfer_str(ele_num), elapsed))
def cost_statistics():
    """Log the distribution of counts and a series of parsing-time samples.

    Every line of ``config.KEY_REVERSE_STATISTICS_FILE`` is split at its
    first comma and the text after it is converted with ``int``. The counts
    are grouped into fixed ranges and each range is reported through
    ``tip``. Afterwards ``test_tip`` is run for a fixed series of sample
    sizes against a list of 14500029 consecutive integers.
    """
    with open(config.KEY_REVERSE_STATISTICS_FILE, "r", encoding=config.ENCODING_CHARSET) as f:
        # int() accepts the trailing newline left on each line.
        count_list = [int(line[line.index(",") + 1:]) for line in f]
    total_count = sum(count_list)

    logging.info("总祠根数:%d, 涉及的总分词查找数:%d" % (len(count_list), total_count))

    # (label, inclusive lower bound, exclusive upper bound); None = no upper
    # bound. The counts are ints, so "== 1" is the range [1, 2) and "> 1"
    # starts at 2.
    buckets = (
        ("等于1", 1, 2),
        ("大于1小于100", 2, 100),
        ("大于等于100小于200", 100, 200),
        ("大于等于200小于300", 200, 300),
        ("大于等于300小于400", 300, 400),
        ("大于等于400小于500", 400, 500),
        ("大于等于500小于1000", 500, 1000),
        ("大于等于1000小于5000", 1000, 5000),
        ("大于等于5000小于1万", 5000, 10000),
        ("大于等于1万小于5万", 10000, 50000),
        ("大于等于5万小于10万", 50000, 100000),
        ("大于等于10万", 100000, None),
    )
    for label, low, high in buckets:
        matching = [
            val for val in count_list
            if val >= low and (high is None or val < high)
        ]
        tip(label, matching)

    sample_list = list(range(14500029))
    sample_sizes = (
        1, 10, 50, 100, 200, 300, 400, 500,
        1000, 5000, 10000, 50000, 100000,
        595528, 689520, 776035, 822266, 951491,
    )
    for ele_num in sample_sizes:
        test_tip(sample_list, ele_num)
def memory_statistics():
    """Build a cache from the leading part of the reverse file and save it.

    The value stored under the key "导不出" in the object loaded from
    ``config.KEY_REVERSE_INDEX_CACHE`` is used as a byte offset: lines of
    ``config.KEY_REVERSE_FILE`` are read through a memory map for as long as
    the offset at the start of the line does not exceed it. Each line is
    split at its first comma into a key and a literal, the literal is parsed
    with ``ast.literal_eval``, and the pair is stored in the cache. The cache
    is then written with ``tools.save_obj`` to
    ``config.KEY_REVERSE_INDEX_HOT_CACHE`` and the function pauses for 20
    seconds so memory usage can be inspected.

    Raises:
        ValueError: If a line read from the file contains no comma.
    """
    key_reverse_index_cache = tools.load_obj(config.KEY_REVERSE_INDEX_CACHE)
    end_pos = key_reverse_index_cache["导不出"]
    logging.info("查找结束位置")

    # Binary mode: the file object is only needed for its descriptor (the
    # memory map does the reading) and every line is decoded explicitly below.
    with open(config.KEY_REVERSE_FILE, "rb") as freverse, \
            mmap.mmap(freverse.fileno(), 0, access=mmap.ACCESS_READ) as fmmap:
        logging.info("开始构建缓存")
        cache = {}
        start = time.perf_counter()
        while fmmap.tell() <= end_pos:
            raw_line = fmmap.readline()
            if not raw_line:
                # readline() returns b"" at end of file. Without this guard a
                # file shorter than end_pos would fail on the comma split.
                logging.warning("文件已读完,但尚未越过结束位置:%s", end_pos)
                break
            # NOTE(review): the decode is hard-coded to UTF-8 while other file
            # access in this module uses config.ENCODING_CHARSET; confirm the
            # two agree.
            line = raw_line.decode("UTF-8")
            # Split at the first comma only; the literal may contain commas.
            key, word_root = line.split(",", 1)
            cache[key] = ast.literal_eval(word_root)
        elapsed = time.perf_counter() - start

    logging.info('构建热点缓存完成,耗时:%s,缓存数量:%d' % (elapsed, len(cache)))
    logging.info('把缓存保存到本地')
    tools.save_obj(config.KEY_REVERSE_INDEX_HOT_CACHE, cache)
    logging.info('保存结束')
    # Announce the observation window before it starts rather than after.
    logging.info('留20s进行内存观察')
    time.sleep(20)
-
# Script entry point. Calls tools.init_log(), then tools.log_start_msg() with
# TASK_TITLE, runs memory_statistics(), and finally calls tools.log_end_msg()
# with TASK_TITLE. cost_statistics() is defined in this file but is not
# invoked from here.
- def main():
- tools.init_log()
- tools.log_start_msg(TASK_TITLE)
- memory_statistics()
-
- tools.log_end_msg(TASK_TITLE)
# Run main() only when this file is executed as a script, not when imported.
- if __name__ == "__main__":
- main()
|