|
@@ -2,19 +2,12 @@
|
|
|
|
|
|
|
|
from concurrent.futures import ProcessPoolExecutor, as_completed
|
|
from concurrent.futures import ProcessPoolExecutor, as_completed
|
|
|
import mmap
|
|
import mmap
|
|
|
-from multiprocessing.connection import wait
|
|
|
|
|
-import random
|
|
|
|
|
-import sys
|
|
|
|
|
-from time import sleep, time
|
|
|
|
|
import os
|
|
import os
|
|
|
import config
|
|
import config
|
|
|
import tools
|
|
import tools
|
|
|
import ast
|
|
import ast
|
|
|
-import re
|
|
|
|
|
-import stop_word
|
|
|
|
|
import logging
|
|
import logging
|
|
|
import math
|
|
import math
|
|
|
-from multiprocessing import Process, Pool
|
|
|
|
|
|
|
|
|
|
TITLE = "关键词倒排文件 统计"
|
|
TITLE = "关键词倒排文件 统计"
|
|
|
|
|
|
|
@@ -26,7 +19,7 @@ def handle(start_pos, end_pos):
|
|
|
|
|
|
|
|
# 统计信息容器
|
|
# 统计信息容器
|
|
|
reverse_statistics = {}
|
|
reverse_statistics = {}
|
|
|
-
|
|
|
|
|
|
|
+
|
|
|
with open(config.KEY_REVERSE_FILE, "r", encoding=config.ENCODING_CHARSET) as fr, \
|
|
with open(config.KEY_REVERSE_FILE, "r", encoding=config.ENCODING_CHARSET) as fr, \
|
|
|
mmap.mmap(fr.fileno(), 0 , access=mmap.ACCESS_READ) as fmmap:
|
|
mmap.mmap(fr.fileno(), 0 , access=mmap.ACCESS_READ) as fmmap:
|
|
|
# 调整开始位置
|
|
# 调整开始位置
|
|
@@ -37,23 +30,23 @@ def handle(start_pos, end_pos):
|
|
|
# 越界检测
|
|
# 越界检测
|
|
|
if cur_pos >= end_pos:
|
|
if cur_pos >= end_pos:
|
|
|
break
|
|
break
|
|
|
-
|
|
|
|
|
|
|
+
|
|
|
line = fmmap.readline().decode(config.ENCODING_CHARSET)
|
|
line = fmmap.readline().decode(config.ENCODING_CHARSET)
|
|
|
index=line.index(",")
|
|
index=line.index(",")
|
|
|
key = line[:index]
|
|
key = line[:index]
|
|
|
word_root = line[index+1:]
|
|
word_root = line[index+1:]
|
|
|
word_root = ast.literal_eval(word_root)
|
|
word_root = ast.literal_eval(word_root)
|
|
|
l = len(word_root)
|
|
l = len(word_root)
|
|
|
-
|
|
|
|
|
|
|
+
|
|
|
reverse_statistics[key]=l
|
|
reverse_statistics[key]=l
|
|
|
-
|
|
|
|
|
|
|
+
|
|
|
logging.info("进程:%d, 统计结束" % os.getpid())
|
|
logging.info("进程:%d, 统计结束" % os.getpid())
|
|
|
|
|
|
|
|
return {
|
|
return {
|
|
|
"pid":os.getpid(),
|
|
"pid":os.getpid(),
|
|
|
"statistics":reverse_statistics
|
|
"statistics":reverse_statistics
|
|
|
}
|
|
}
|
|
|
-
|
|
|
|
|
|
|
+
|
|
|
|
|
|
|
|
def main2():
|
|
def main2():
|
|
|
# 日志信息配置
|
|
# 日志信息配置
|
|
@@ -89,7 +82,7 @@ def main2():
|
|
|
# 记录位置信息
|
|
# 记录位置信息
|
|
|
pos_list.append(pos)
|
|
pos_list.append(pos)
|
|
|
|
|
|
|
|
-
|
|
|
|
|
|
|
+
|
|
|
# 使用用进程池
|
|
# 使用用进程池
|
|
|
pool = ProcessPoolExecutor(process_num)
|
|
pool = ProcessPoolExecutor(process_num)
|
|
|
# 生成任务
|
|
# 生成任务
|
|
@@ -97,7 +90,7 @@ def main2():
|
|
|
for i in range(0, len(pos_list)-1):
|
|
for i in range(0, len(pos_list)-1):
|
|
|
pos = pos_list[i: i+2]
|
|
pos = pos_list[i: i+2]
|
|
|
process_futures.append(pool.submit(handle, pos[0], pos[1]))
|
|
process_futures.append(pool.submit(handle, pos[0], pos[1]))
|
|
|
-
|
|
|
|
|
|
|
+
|
|
|
# with open(config.KEY_REVERSE_STATISTICS_FILE, "w", encoding=config.ENCODING_CHARSET) as fw:
|
|
# with open(config.KEY_REVERSE_STATISTICS_FILE, "w", encoding=config.ENCODING_CHARSET) as fw:
|
|
|
# for future in as_completed(process_futures):
|
|
# for future in as_completed(process_futures):
|
|
|
# logging.info("部分子任务统计结束,保存至本地 - 开始")
|
|
# logging.info("部分子任务统计结束,保存至本地 - 开始")
|
|
@@ -105,7 +98,7 @@ def main2():
|
|
|
# fw.write("%s,%s\n"%(key,value))
|
|
# fw.write("%s,%s\n"%(key,value))
|
|
|
# logging.info("部分子任务统计结束,保存至本地 - 结束")
|
|
# logging.info("部分子任务统计结束,保存至本地 - 结束")
|
|
|
|
|
|
|
|
-
|
|
|
|
|
|
|
+
|
|
|
results = []
|
|
results = []
|
|
|
for future in as_completed(process_futures):
|
|
for future in as_completed(process_futures):
|
|
|
result = future.result()
|
|
result = future.result()
|
|
@@ -142,14 +135,14 @@ def main2():
|
|
|
# p = Process(target=handle, args=('测试进程', 1, 10))
|
|
# p = Process(target=handle, args=('测试进程', 1, 10))
|
|
|
# p.start()
|
|
# p.start()
|
|
|
# p.join()
|
|
# p.join()
|
|
|
-
|
|
|
|
|
|
|
+
|
|
|
# tools.init_log()
|
|
# tools.init_log()
|
|
|
# tools.log_start_msg(TITLE)
|
|
# tools.log_start_msg(TITLE)
|
|
|
|
|
|
|
|
# key_reverse_index = tools.load_obj(config.KEY_REVERSE_INDEX_CACHE)
|
|
# key_reverse_index = tools.load_obj(config.KEY_REVERSE_INDEX_CACHE)
|
|
|
|
|
|
|
|
# tmp = [key for key in key_reverse_index.keys()]
|
|
# tmp = [key for key in key_reverse_index.keys()]
|
|
|
-
|
|
|
|
|
|
|
+
|
|
|
# l = len(tmp)
|
|
# l = len(tmp)
|
|
|
# print("总长:", l)
|
|
# print("总长:", l)
|
|
|
# internal = math.ceil(l / 4)
|
|
# internal = math.ceil(l / 4)
|
|
@@ -168,7 +161,7 @@ def main2():
|
|
|
# pos = key_reverse_index[key[0]]
|
|
# pos = key_reverse_index[key[0]]
|
|
|
# print(key, pos)
|
|
# print(key, pos)
|
|
|
|
|
|
|
|
-
|
|
|
|
|
|
|
+
|
|
|
# reverse_statistics = {}
|
|
# reverse_statistics = {}
|
|
|
# logging.info("统计开始")
|
|
# logging.info("统计开始")
|
|
|
# with open(config.KEY_REVERSE_FILE, "r", encoding=config.ENCODING_CHARSET) as fr, \
|
|
# with open(config.KEY_REVERSE_FILE, "r", encoding=config.ENCODING_CHARSET) as fr, \
|
|
@@ -179,7 +172,7 @@ def main2():
|
|
|
# word_root = line[index+1:]
|
|
# word_root = line[index+1:]
|
|
|
# word_root = ast.literal_eval(word_root)
|
|
# word_root = ast.literal_eval(word_root)
|
|
|
# l = len(word_root)
|
|
# l = len(word_root)
|
|
|
-
|
|
|
|
|
|
|
+
|
|
|
# reverse_statistics[key]=l
|
|
# reverse_statistics[key]=l
|
|
|
|
|
|
|
|
# logging.info("统计结束,保存至本地")
|
|
# logging.info("统计结束,保存至本地")
|
|
@@ -193,7 +186,7 @@ def main2():
|
|
|
def main():
|
|
def main():
|
|
|
tools.init_log()
|
|
tools.init_log()
|
|
|
tools.log_start_msg(TITLE)
|
|
tools.log_start_msg(TITLE)
|
|
|
-
|
|
|
|
|
|
|
+
|
|
|
reverse_statistics = {}
|
|
reverse_statistics = {}
|
|
|
logging.info("统计开始")
|
|
logging.info("统计开始")
|
|
|
with open(config.KEY_REVERSE_FILE, "r", encoding=config.ENCODING_CHARSET) as fr, \
|
|
with open(config.KEY_REVERSE_FILE, "r", encoding=config.ENCODING_CHARSET) as fr, \
|
|
@@ -204,7 +197,7 @@ def main():
|
|
|
word_root = line[index+1:]
|
|
word_root = line[index+1:]
|
|
|
word_root = ast.literal_eval(word_root)
|
|
word_root = ast.literal_eval(word_root)
|
|
|
l = len(word_root)
|
|
l = len(word_root)
|
|
|
-
|
|
|
|
|
|
|
+
|
|
|
reverse_statistics[key]=l
|
|
reverse_statistics[key]=l
|
|
|
|
|
|
|
|
logging.info("统计结束,保存至本地")
|
|
logging.info("统计结束,保存至本地")
|
|
@@ -215,4 +208,4 @@ def main():
|
|
|
tools.log_end_msg(TITLE)
|
|
tools.log_end_msg(TITLE)
|
|
|
|
|
|
|
|
if __name__ == "__main__":
|
|
if __name__ == "__main__":
|
|
|
- main2()
|
|
|
|
|
|
|
+ main2()
|