|
|
@@ -1,228 +0,0 @@
|
|
|
-# -*- coding:utf-8 -*-
|
|
|
-
|
|
|
-import datetime
|
|
|
-import os
|
|
|
-import math
|
|
|
-import pickle
|
|
|
-from time import sleep
|
|
|
-import jieba
|
|
|
-from multiprocessing import Process, Manager
|
|
|
-
|
|
|
-from zmq import QUEUE
|
|
|
-
|
|
|
-# TODO
|
|
|
-# 1. 研究jieba多进程切词(windows上无法使用自带的多进程切词功能)
|
|
|
-# 2. 进一步减少断点保存的次数(即调整保存间隔,或者全内存中)
|
|
|
-# 3. 加入在分词完成后保存结果以防丢失
|
|
|
-# 4. 在每个进程中分别保存分词结果,然后再统一合并
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
# Path of the input data file to be processed
DATA_FILE = './data/合并结果.txt'

# Output path for the segmentation results
CUT_OUTPUT_FILE = './data/分词结果.txt'

# Whether to save the final result once segmentation finishes
IS_ASSUME_TOTAL = True

# Whether to checkpoint-and-resume during processing
IS_ASSUME = False

# Whether test mode is enabled
IS_TEST_MODE = False

# Number of records used in test mode
TEST_DATA_NUM = 100 * 10000

# Checkpoint save interval while in test mode
TEST_SAVE_INTERNAL = 200

# Character encoding used for file I/O
ENCODING_CHARSET = "UTF-8"

# Template for the per-process checkpoint (pickle) file paths
CONFIG_PATH = "./data/pkl/cut_config_%d.pkl"

# Number of worker processes
PROCESS_NUM = os.cpu_count()

# How many items are processed between checkpoint saves
SAVE_INTERNAL = TEST_SAVE_INTERNAL if IS_TEST_MODE else 1000000

# How many items are processed between progress messages
PROCESS_TIPS_INTERNAL = 10 * 10000
|
|
|
def save_config(config_path, config_obj):
    """Serialize the checkpoint object *config_obj* to *config_path* with pickle."""
    with open(config_path, mode="wb") as handle:
        pickle.dump(config_obj, handle)
|
|
|
-
|
|
|
-
|
|
|
def load_config(config_path):
    """Deserialize and return a checkpoint object previously written with pickle."""
    handle = open(config_path, "rb")
    try:
        return pickle.load(handle)
    finally:
        handle.close()
|
|
|
-
|
|
|
def cut_word(word):
    """Segment *word* with jieba's search-engine mode and return the tokens as a list."""
    return list(jieba.cut_for_search(word))
|
|
|
-
|
|
|
def multiprocess_cut_word(process_name, data_list, result_dict, config_path, cut_config):
    """Worker-process entry point: segment one slice of *data_list*.

    The slice boundaries live in *cut_config* ("current_pos"/"end_pos"). When
    checkpointing is enabled, an existing pickle at *config_path* overrides the
    passed-in config so work resumes where it stopped. Results are published
    into the shared *result_dict* under *process_name*.
    """
    print('进程:%s -> 分词处理开始' % process_name)

    # Restore a previously saved checkpoint from disk, if any.
    if (IS_ASSUME_TOTAL or IS_ASSUME) and os.path.exists(config_path):
        cut_config = load_config(config_path)
        print("进程:%s -> 进断点恢复 当前状态:%s,开始处理位置:%d" % (process_name, cut_config["state"], cut_config["current_pos"]))

    # Guard clause: this slice already finished in an earlier run —
    # just re-publish the restored result.
    if cut_config['state'] != 'run':
        result_dict[process_name] = cut_config["word_dict"]
        print('进程:%s -> 断点恢复,分词处理结束' % process_name)
        return

    # Remaining lines for this worker.
    pending = data_list[cut_config['current_pos']:cut_config['end_pos']]
    remaining = len(pending)
    print("进程:%s ->剩余待处理数量:%d" % (process_name, remaining))

    for idx, raw_line in enumerate(pending):
        # Strip the trailing newline, then segment.
        text = raw_line.replace("\n", "")
        cut_config["word_dict"][text] = cut_word(text)

        # Periodic checkpoint save (idx is always a multiple of SAVE_INTERNAL
        # here, so current_pos stays aligned with the slice position).
        if IS_ASSUME and idx > 0 and idx % SAVE_INTERNAL == 0:
            cut_config["current_pos"] += SAVE_INTERNAL
            print("进程:%s -> 断点保存 当前状态:%s,当前处理位置:%d" % (process_name, cut_config["state"], cut_config["current_pos"]))
            save_config(config_path, cut_config)

        # Periodic progress message.
        if idx > 0 and idx % PROCESS_TIPS_INTERNAL == 0:
            print("进程:%s -> 当前处理进度:%d / %d" % (process_name, idx, remaining))

    # Persist the finished state so a re-run skips this slice.
    if IS_ASSUME_TOTAL or IS_ASSUME:
        print("进程:%s -> 保存最终的分词结果" % process_name)
        cut_config["state"] = "end"
        cut_config["current_pos"] = cut_config['end_pos']
        save_config(config_path, cut_config)

    result_dict[process_name] = cut_config["word_dict"]
    print('进程:%s -> 分词处理结束' % process_name)
|
|
|
-
|
|
|
def main():
    """Segment every line of DATA_FILE with PROCESS_NUM worker processes.

    Splits the input into contiguous slices, runs one multiprocess_cut_word
    worker per slice, then merges the per-process dictionaries and writes
    "line,tokens" records to CUT_OUTPUT_FILE.
    """
    print("开始时间:", datetime.datetime.now())

    print("配置:启动用断点续存,保存间隔:%d" % SAVE_INTERNAL if IS_ASSUME else "配置:不启用断点续存")
    print("配置:保存最终的分词结果" if IS_ASSUME_TOTAL else "配置:不保存最终的分词结果")

    # Worker process handles
    process_list = []
    # Per-process checkpoint configurations
    config_list = []

    # Shared state between the worker processes
    manager = Manager()
    # Shared input data
    global_list = manager.list()
    # Shared per-process results
    result_dict = manager.dict()

    print("加载数据")
    with open(DATA_FILE, "r", encoding="UTF-8") as f:
        if IS_TEST_MODE:
            print("当前处于测试模式,测试数据量:%d" % TEST_DATA_NUM)
            global_list.extend(f.readlines()[:TEST_DATA_NUM])
        else:
            global_list.extend(f.readlines())

    total_len = len(global_list)
    count = math.ceil(total_len / PROCESS_NUM)
    print("待处理总数量:%d, 数量区间:%d" % (total_len, count))

    # Build one checkpoint config per worker, each covering a contiguous slice.
    for i in range(PROCESS_NUM):
        start_pos = i * count
        # BUGFIX: the original clamped an overshooting end to -1, which made
        # the worker slice data_list[start:-1] and silently drop the final
        # input line. Clamp to total_len instead so every line is processed.
        end_pos = min(start_pos + count, total_len)
        cut_config = {
            "state": "run",
            "start_pos": start_pos,
            "current_pos": start_pos,
            "end_pos": end_pos,
            "word_dict": {}
        }
        config_list.append(cut_config)

    print("配置", config_list)

    # Fan out: one process per slice.
    for i, config in enumerate(config_list):
        p = Process(target=multiprocess_cut_word, args=("进程-%d" % i, global_list, result_dict, CONFIG_PATH % i, config))
        p.start()
        process_list.append(p)

    for p in process_list:
        p.join()

    print("合并最终的分词结果:开始")

    result = []
    print("处理成list便于写入文件")
    # BUGFIX: the original rebuilt and re-extended the full "key,value" list
    # once per dictionary entry, duplicating every record len(word_dict)
    # times. Build each process's records exactly once.
    for process_name, word_dict in result_dict.items():
        result.extend("%s,%s\n" % (key, value) for (key, value) in word_dict.items())

    print("写入文件")
    with open(CUT_OUTPUT_FILE, "w", encoding=ENCODING_CHARSET) as f:
        f.writelines(result)
    print("合并最终的分词结果:结束")

    print("结束时间:", datetime.datetime.now())
|
|
|
-
|
|
|
def main2():
    """Merge previously saved per-process checkpoint pickles into CUT_OUTPUT_FILE.

    Reads the four checkpoint files (CONFIG_PATH % 0..3) and appends each
    stored "line,tokens" pair to the output file.
    """
    print("开始时间:", datetime.datetime.now())

    with open(CUT_OUTPUT_FILE, "a", encoding=ENCODING_CHARSET) as out:
        for idx in range(4):
            pkl_path = CONFIG_PATH % idx

            print("时间:%s, 读取:%s —— 开始" % (datetime.datetime.now(), pkl_path))
            checkpoint = load_config(pkl_path)
            print("时间:%s, 读取:%s —— 结束" % (datetime.datetime.now(), pkl_path))

            print("时间:%s,写入文件 -- 开始" % datetime.datetime.now())
            for key, value in checkpoint["word_dict"].items():
                out.write("%s,%s\n" % (key, value))
            print("时间:%s,写入文件 -- 结束" % datetime.datetime.now())

    print("结束时间:", datetime.datetime.now())
|
|
|
-
|
|
|
-
|
|
|
# Script entry point: currently merges saved checkpoints (main2) rather than
# running a fresh segmentation (main).
if __name__ == '__main__':
    main2()
|