ChenGanBin 3 years ago
parent
commit
2217b0b5bd
22 changed files with 948 additions and 1448 deletions
  1. 6 1
      .gitignore
  2. 20 0
      REMEAD.md
  3. 155 0
      agg_word.py
  4. 0 143
      analyse.py
  5. 0 146
      analyse2.py
  6. 74 0
      cal.py
  7. 43 0
      config.py
  8. 97 0
      cut.py
  9. 0 228
      cut_multiprocess.py
  10. 0 176
      cut_multiprocess2.py
  11. 0 147
      cut_statistics.py
  12. 0 185
      data/category/白凉粉是什么东西.txt
  13. 0 44
      data/category/腋下长了一个小疙瘩是什么东西.txt
  14. 52 0
      key.py
  15. 58 0
      key_index.py
  16. 91 0
      key_reverse.py
  17. 57 0
      key_reverse_index.py
  18. 28 0
      logging.conf
  19. 18 15
      merge.py
  20. 65 0
      stop_word.py
  21. 184 0
      tools.py
  22. 0 363
      长尾关键词分析.ipynb

+ 6 - 1
.gitignore

@@ -5,4 +5,9 @@ data/tmp/*.csv
 data/pkl/*.pkl
 data/*.txt
 data/*.csv
-临时/
+临时/
+data_bak/
+src_bak/
+data/analyse/
+data/analyse_bak/
+data/cache/

+ 20 - 0
REMEAD.md

@@ -0,0 +1,20 @@
+# Processing steps
+
+1. Download the broad keywords from 5118 (CSV files)
+
+2. Segment the broad keywords (cut.py)
+
+    * word segmentation and word-frequency statistics
+    * sort in descending order of word frequency
+
+3. Fetch expanded keywords based on the word frequencies
+
+4. Merge all expanded keywords into one file (merge.py)
+
+5. Generate the keyword file with its three elements: index, keyword, segmentation result (key.py)
+
+6. Generate the index file for the keyword file (key_index.py)
+
+7. Generate the inverted file from the keyword file (key_reverse.py)
+
+8. Generate the final aggregation file from the keyword, index, and inverted files (agg_word.py); a driver sketch tying the steps together follows below
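Taken together, the eight steps form a linear pipeline. A minimal driver sketch is shown below, assuming the entry points visible in this commit (`main()` everywhere except merge.py, whose entry is `merge_file_content()`); steps 1 and 3 remain manual:

```python
# run_pipeline.py -- hypothetical driver, not part of this commit
import agg_word
import cut
import key
import key_index
import key_reverse
import key_reverse_index
import merge

def run_all():
    cut.main()                    # step 2: segmentation + word-frequency stats
    merge.merge_file_content()    # step 4: merge the expanded keywords
    key.main()                    # step 5: index,keyword,roots file
    key_index.main()              # step 6: keyword index
    key_reverse.main()            # step 7: inverted file
    key_reverse_index.main()      #         plus its index
    agg_word.main()               # step 8: final aggregation

if __name__ == "__main__":
    run_all()
```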

+ 155 - 0
agg_word.py

@@ -0,0 +1,155 @@
+# -*- coding:utf-8 -*-
+
+import mmap
+import config
+import tools
+import stop_word
+import re
+import ast
+import cal
+import logging
+from bitmap import BitMap
+
+TITLE = "聚合文件"
+
+def re_extract_key(pattern, line):
+    """
+    Extract the keyword fields with a regex
+    """
+    m = pattern.match(line)
+    # keyword index
+    index = m.group(1)
+    # the keyword itself
+    key = m.group(2)
+    # the keyword's segmentation roots
+    word_root = m.group(3)
+    # convert the index to an int for convenience
+    return int(index), key, word_root
+
+def main():
+    # initialize logging
+    tools.init_log()
+    tools.log_start_msg(TITLE)
+
+    # stop words
+    logging.info("loading stop words")
+    stop_word_cache = stop_word.load_stop_word()
+    # keyword index
+    logging.info("loading the keyword index")
+    key_index_cache = tools.load_obj(config.KEY_INDEX_CACHE)
+    # inverted index
+    logging.info("loading the inverted index")
+    key_reverse_index_cache = tools.load_obj(config.KEY_REVERSE_INDEX_CACHE)
+    # regex for extracting the fields
+    s = r"(\d+),([^,]*),(.*)"
+    pattern = re.compile(s, re.I)
+
+    with open(config.KEY_FILE, "r", encoding=config.ENCODING_CHARSET) as fkey, \
+        open(config.KEY_REVERSE_FILE, "r", encoding=config.ENCODING_CHARSET) as freverse, \
+        mmap.mmap(fkey.fileno(), 0, access=mmap.ACCESS_READ) as f_key_mmap, \
+        mmap.mmap(freverse.fileno(), 0, access=mmap.ACCESS_READ) as f_reverse_mmap:
+
+        # total keyword count
+
+        # TODO read this from the statistics info instead
+        total_count = 14500029
+        
+        # build the bitmap
+        bm = BitMap(total_count)
+
+        # total size of the file to process
+        total_num = f_key_mmap.size()
+
+        while True:
+            # current position
+            cur_pos = f_key_mmap.tell()
+
+            # progress hint
+            tools.tip_in_size(total_num, cur_pos)
+
+            # read the next keyword to process
+            line = f_key_mmap.readline().decode(config.ENCODING_CHARSET)
+            
+            # stop when there is nothing left to read
+            if not line:
+                logging.info("reached an empty line")
+                break
+
+            # extract the fields
+            index, key, word_root = re_extract_key(pattern, line)
+            
+            # bitmap check: skip if this keyword was already processed
+            if bm.test(index):
+                logging.debug("main keyword %s already processed, skipping" % key)
+                continue
+
+            # passed the bitmap check; mark this keyword's bit as processed
+            bm.set(index)
+
+            # container for the aggregation result
+            agg_cache = []
+
+            # record the main keyword
+            agg_cache.append(key)
+
+            # convert the root string into a real list object
+            word_root = ast.literal_eval(word_root)
+            logging.debug("current main keyword: %s, root count: %d" % (key, len(word_root)))
+            for item in word_root:
+                # 排除停用词
+                if item in stop_word_cache:
+                    continue
+                
+                # 根据倒排索引,获取相关的关键词序号
+                other_key_pos = key_reverse_index_cache.get(item)
+                f_reverse_mmap.seek(other_key_pos)
+                other_key_line = f_reverse_mmap.readline().decode(config.ENCODING_CHARSET)
+                # 截取关键词索引部分
+                other_index = other_key_line.index(",")
+                other_key_indexs = other_key_line[other_index+1:]
+                # 转换成真正的list对象
+                other_key_indexs = ast.literal_eval(other_key_indexs)
+                if not other_key_indexs:
+                    continue
+
+                logging.debug("词根:%s, 涉及的其它关键词数量:%d" % (item, len(other_key_indexs)))
+                for other_key_index in other_key_indexs:
+                    # bitmap校验,如果已经处理过则跳过
+                    if bm.test(int(other_key_index)):
+                        logging.debug("待比较关键词:%s 已处理,跳过" % other_key_index)
+                        continue
+
+                    # 从关键词索引中获取关键词位置
+                    pos = key_index_cache[other_key_index]
+                    # 获取待比较的关键词
+                    f_key_mmap.seek(pos)
+                    other_key_line = f_key_mmap.readline().decode(config.ENCODING_CHARSET)
+                    other_key_index, other_key,other_word_root = re_extract_key(pattern, other_key_line)
+
+                    # 计算相关性
+                    val = cal.cal_cos(key, other_key, word_root, other_word_root)
+                    if val >= 0.8:
+                        # 设置bitmap,该关键词已经处理过
+                        bm.set(other_key_index)
+
+                        # 记录类似的关键词
+                        agg_cache.append(other_key)
+        
+            # save to disk
+            with open(config.AGG_ANALYSE_FILE % key, "w", encoding=config.ENCODING_CHARSET) as f:
+                for item in agg_cache:
+                    f.write(item)
+                    f.write("\n")
+
+            # stop once every keyword has been processed
+            if bm.all():
+                logging.info("bitmap is all 1s")
+                break
+            else:
+                count = bm.count()
+                logging.info("processed: %d / %d, remaining: %d / %d" % (count, total_count, (total_count - count), total_count))
+
+    tools.log_end_msg(TITLE)
+
+if __name__ == "__main__":
+    main()
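For reference, each key.csv record is written by key.py as `index,keyword,roots`, which is exactly what the `(\d+),([^,]*),(.*)` pattern above pulls apart. A minimal round-trip sketch (the sample line is illustrative, not taken from real data):

```python
import ast
import re

# record format produced by key.py: "<index>,<keyword>,<repr of root list>"
pattern = re.compile(r"(\d+),([^,]*),(.*)")

# illustrative sample line in that format
line = "0,白凉粉是什么东西,['白', '凉粉', '是', '什么', '东西']"

m = pattern.match(line)
index = int(m.group(1))                    # 0
key = m.group(2)                           # the keyword itself
word_root = ast.literal_eval(m.group(3))   # back to a real list of roots
print(index, key, word_root)
```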

+ 0 - 143
analyse.py

@@ -1,143 +0,0 @@
-# -*- coding:utf-8 -*-
-
-import math
-import jieba
-import re
-import os
-import pickle
-
-word_dict = {}
-
-SPECIAL_SIMBOL = [".", "?", "^", "$", "*", "+", "\\", "[", "]", "|", "{", "}", "(", ")"]
-
-def cut_word(word):
-    """
-    Segment a word into roots
-    """
-    word_root = jieba.cut_for_search(word)
-    return list(word_root)
-
-def merge_word_root(word_root_a, word_root_b):
-    """
-    Merge two root lists
-    """
-    return list(set(word_root_a).union(set(word_root_b)))
-
-def gen_word_vector(word_a, word_b, word_root_union):
-    """
-    Build the word vectors
-    """
-    a_word_vector, b_word_vector = [], []
-    for word in word_root_union:
-        if word in SPECIAL_SIMBOL :
-            word = "\\" + word
-        a_word_vector.append(len(re.findall(word, word_a)))
-        b_word_vector.append(len(re.findall(word, word_b)))
-    return a_word_vector, b_word_vector
-
-def vector_multi(a_vector, b_vector):
-    """
-    Dot product of two vectors
-    """
-    return sum(map(lambda a_b: a_b[0]*a_b[1], zip(a_vector, b_vector)))
-
-def vector_square_sum(word_vector):
-    """
-    Sum of squared vector elements
-    """
-    sum = 0
-    for i in word_vector:
-        sum = sum + i * i
-    return sum
-
-def vector_cos(v_multi, a_v_ss, b_v_ss):
-    """
-    Compute the cosine value
-    """
-    return v_multi / (math.sqrt(a_v_ss) * math.sqrt(b_v_ss))
-
-def cal_cos(a_word, b_word, word_dict):
-    """
-    Compute the cosine similarity of two long-tail keywords
-    """
-    # a_word_root = cut_word(a_word)
-    # b_word_root = cut_word(b_word)
-
-    a_word_root = word_dict[a_word]
-    b_word_root = word_dict[b_word]
-
-    # merge the roots to build the vectors over
-    union_word_root = merge_word_root(a_word_root, b_word_root)
-
-    # build the word vectors
-    a_vector, b_vector = gen_word_vector(a_word, b_word, union_word_root)
-
-    # dot product of the word vectors
-    ab_vector_multi = vector_multi(a_vector, b_vector)
-
-    # sum of squares for each vector
-    a_vector_squar_sum = vector_square_sum(a_vector)
-    b_vector_squar_sum = vector_square_sum(b_vector)
-
-    cos_val = vector_cos(ab_vector_multi, a_vector_squar_sum, b_vector_squar_sum)
-
-    return cos_val
-
-
-def load_word_root_cache():
-    word_root_cache = {}
-
-    if os.path.exists("./data/pkl/word_root_cache.pkl"):
-        print("cache found, loading")
-        with open("./data/pkl/word_root_cache.pkl", "rb") as f:
-            word_root_cache = pickle.load(f)
-            return word_root_cache
-
-    print("no cache, building the segmentation dict")
-    with open("./data/分词结果_bak.txt", "r", encoding="UTF-8") as f:
-        lines = f.readlines()
-        for line in lines:
-            index = line.index(",")
-            word_root_cache[line[:index]] = line[index+1:]
-    
-    print("done building, saving locally")
-    with open("./data/pkl/word_root_cache.pkl", "wb") as f:
-        pickle.dump(word_root_cache, f)
-    
-    return word_root_cache
-
-word_dict = load_word_root_cache()
-
-key_list = list(word_dict.keys())
-for i, a_key in enumerate(key_list[:-1]):
-    with open("./data/category/%s.txt" % a_key, "w", encoding="UTF-8") as f:
-        f.write(a_key)
-        f.write("\n\n")
-
-        del_container = []
-        for j, b_key in  enumerate(key_list[i+1:]):
-            if j % 100000 == 0 :
-                print("processing: %d, %d" % (i, j))
-            cos_val = cal_cos(a_key, b_key, word_dict)
-            if cos_val > 0.8 :
-                print("cosine of %s and %s: %f " % (a_key, b_key, cos_val))
-                f.write(b_key)
-                f.write("\n")
-        
-        key_list.remove(a_key)
-        if len(del_container) > 0:
-            print("removing processed elements")
-            for item in del_container:
-                key_list.remove(item)
-
-
-
-# a_word = "腋下长了一个小疙瘩是什么东西"
-# b_word = "什么东西吃蟑螂(四个字)"
-# cos_val = cal_cos(a_word, b_word)
-# print(cos_val)
-# print("cosine value: %f " % ( cos_val))
-# print(cut_word(b_word))
-
-# 1. use the cached segmentation
-# 2. treat cosine values above 0.8 as one group

+ 0 - 146
analyse2.py

@@ -1,146 +0,0 @@
-# -*- coding:utf-8 -*-
-
-import os
-import re
-import pickle
-import math
-from multiprocessing import Manager, Process
-
-
-# segmentation results
-CUT_WORD_RESULT = "./data/分词结果_bak.txt"
-
-# segmentation cache
-CUT_WORD_CACHE = "./data/pkl/word_root_cache.pkl"
-
-# analysis output
-ANALYSE_OUTPUT_FILE = "./data/category/%s.txt"
-
-# special characters in regular expressions
-SPECIAL_SIMBOL = [".", "?", "^", "$", "*", "+", "\\", "[", "]", "|", "{", "}", "(", ")"]
-
-def merge_word_root(word_root_a, word_root_b):
-    """
-    Merge two root lists
-    """
-    return list(set(word_root_a).union(set(word_root_b)))
-
-def gen_word_vector(word_a, word_b, word_root_union):
-    """
-    Build the word vectors
-    """
-    a_word_vector, b_word_vector = [], []
-    for word in word_root_union:
-        if word in SPECIAL_SIMBOL :
-            word = "\\" + word
-        a_word_vector.append(len(re.findall(word, word_a)))
-        b_word_vector.append(len(re.findall(word, word_b)))
-    return a_word_vector, b_word_vector
-
-def vector_multi(a_vector, b_vector):
-    """
-    Dot product of two vectors
-    """
-    return sum(map(lambda a_b: a_b[0]*a_b[1], zip(a_vector, b_vector)))
-
-def vector_square_sum(word_vector):
-    """
-    Sum of squared vector elements
-    """
-    sum = 0
-    for i in word_vector:
-        sum = sum + i * i
-    return sum
-
-def vector_cos(v_multi, a_v_ss, b_v_ss):
-    """
-    Compute the cosine value
-    """
-    return v_multi / (math.sqrt(a_v_ss) * math.sqrt(b_v_ss))
-
-def cal_cos(a_word, b_word, word_dict):
-    """
-    Compute the cosine similarity of two long-tail keywords
-    """
-    a_word_root = word_dict[a_word]
-    b_word_root = word_dict[b_word]
-
-    # merge the roots to build the vectors over
-    union_word_root = merge_word_root(a_word_root, b_word_root)
-
-    # build the word vectors
-    a_vector, b_vector = gen_word_vector(a_word, b_word, union_word_root)
-
-    # dot product of the word vectors
-    ab_vector_multi = vector_multi(a_vector, b_vector)
-
-    # sum of squares for each vector
-    a_vector_squar_sum = vector_square_sum(a_vector)
-    b_vector_squar_sum = vector_square_sum(b_vector)
-
-    cos_val = vector_cos(ab_vector_multi, a_vector_squar_sum, b_vector_squar_sum)
-
-    return cos_val
-
-def process(global_word_root, global_del_cache, a_key, keys):
-    container = []
-    total_num = len(keys)
-    for j, b_key in  enumerate(keys):
-        if j % 100000 == 0 :
-            print("progress: %d / %d" % (j, total_num))
-        cos_val = cal_cos(a_key, b_key, global_word_root)
-        if cos_val > 0.8 and b_key not in global_del_cache :
-            print("cosine of %s and %s: %f " % (a_key, b_key, cos_val))
-            container.append(b_key)
-            global_del_cache.append(b_key)
-
-    with open(ANALYSE_OUTPUT_FILE % a_key, "w", encoding="UTF-8") as f:
-        f.write(a_key)
-        f.write("\n")
-        for b_key in container:
-            f.write(b_key)
-            f.write("\n")
-
-def load_word_root_cache():
-    """
-    Load the segmentation cache
-    """
-
-    word_root_cache = {}
-
-    if os.path.exists(CUT_WORD_CACHE):
-        print("cache found, loading")
-        with open(CUT_WORD_CACHE, "rb") as f:
-            word_root_cache = pickle.load(f)
-            return word_root_cache
-
-    print("no cache, building the segmentation dict")
-    with open(CUT_WORD_RESULT, "r", encoding="UTF-8") as f:
-        lines = f.readlines()
-        for line in lines:
-            index = line.index(",")
-            word_root_cache[line[:index]] = line[index+1:]
-    
-    print("done building, saving locally")
-    with open(CUT_WORD_CACHE, "wb") as f:
-        pickle.dump(word_root_cache, f)
-    
-    return word_root_cache
-
-
-def main():
-
-    word_root_cache = load_word_root_cache();
-
-    keys = [key for key in word_root_cache.keys()]
-
-    manager = Manager()
-    global_word_root = manager.dict(word_root_cache)
-    global_del_cache = manager.list()
-
-    p = Process(target=process, args=(global_word_root, global_del_cache, keys[0], keys[1:]))
-    p.join()
-
-
-if __name__ == "__main__":
-    main()

+ 74 - 0
cal.py

@@ -0,0 +1,74 @@
+# -*- coding:utf-8 -*-
+
+import config
+import re
+import math
+
+def merge_word_root(word_root_a, word_root_b):
+    """
+    Merge two root lists
+    """
+    return list(set(word_root_a).union(set(word_root_b)))
+
+def gen_word_vector(word_a, word_b, word_root_union):
+    """
+    Build the word vectors: each component counts how often a root occurs in the keyword
+    """
+    a_word_vector, b_word_vector = [], []
+    for word in word_root_union:
+        if word in config.RE_SPECIAL_SIMBOL :
+            word = "\\" + word
+        a_word_vector.append(len(re.findall(word, word_a)))
+        b_word_vector.append(len(re.findall(word, word_b)))
+    return a_word_vector, b_word_vector
+
+def vector_multi(a_vector, b_vector):
+    """
+    Dot product of two vectors
+    """
+    return sum(map(lambda a_b: a_b[0]*a_b[1], zip(a_vector, b_vector)))
+
+def vector_square_sum(word_vector):
+    """
+    Sum of squared vector elements
+    """
+    total = 0
+    for i in word_vector:
+        total = total + i * i
+    return total
+
+def vector_cos(v_multi, a_v_ss, b_v_ss):
+    """
+    Compute the cosine value
+    """
+    return v_multi / (math.sqrt(a_v_ss) * math.sqrt(b_v_ss))
+
+
+def cal_cos(a_word, b_word, a_word_root, b_word_root):
+    """
+    Compute the cosine similarity of two long-tail keywords
+    """
+
+    # merge the roots to build the vectors over
+    union_word_root = merge_word_root(a_word_root, b_word_root)
+
+    # build the word vectors
+    a_vector, b_vector = gen_word_vector(a_word, b_word, union_word_root)
+
+    # dot product of the word vectors
+    ab_vector_multi = vector_multi(a_vector, b_vector)
+
+    # sum of squares for each vector
+    a_vector_squar_sum = vector_square_sum(a_vector)
+    b_vector_squar_sum = vector_square_sum(b_vector)
+
+    cos_val = vector_cos(ab_vector_multi, a_vector_squar_sum, b_vector_squar_sum)
+
+    return cos_val
+
+if __name__ == "__main__":
+    a_word = "腋下长了一个小疙瘩是什么东西"
+    b_word = "白凉粉是什么东西"
+    a_word_root = ['腋下', '长', '了', '一个', '小', '疙瘩', '是', '什么', '东西']
+    b_word_root = ['白', '凉粉', '是', '什么', '东西']
+    print(cal_cos(a_word, b_word, a_word_root, b_word_root))
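The function above is the standard cosine similarity over root-count vectors. Writing $A$ and $B$ for the two vectors built over the merged root set, `cal_cos` evaluates

$$\cos(A,B) = \frac{\sum_i A_i B_i}{\sqrt{\sum_i A_i^2}\,\sqrt{\sum_i B_i^2}},$$

and agg_word.py groups two keywords together once this value reaches 0.8.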

+ 43 - 0
config.py

@@ -0,0 +1,43 @@
+# -*- coding:utf-8 -*-
+
+# file encoding
+ENCODING_CHARSET = "UTF-8"
+
+# segmentation and word-frequency statistics
+CUT_FILE = "./data/tmp/cut.csv"
+
+# merged expanded-keyword file
+MERGE_FILE = "./data/tmp/merge.csv"
+
+# files excluded from merging
+MERGE_EXCLUDE_FILES = ['打开乱码如何处理?.txt']
+
+# keyword file (three elements: index, keyword, roots)
+KEY_FILE = "./data/tmp/key.csv"
+
+# keyword index file (two elements: keyword index, position in file) (currently unused)
+KEY_INDEX_FILE = "./data/tmp/key_index.csv"
+
+# keyword index cache (two elements: keyword index, position in file)
+KEY_INDEX_CACHE = "./data/cache/key_index.pkl"
+
+# keyword inverted file (two elements: root, keyword indexes)
+KEY_REVERSE_FILE = "./data/tmp/key_reverse.csv"
+
+# keyword inverted-index cache (two elements: root, position)
+KEY_REVERSE_INDEX_CACHE = "./data/cache/key_reverse_index.pkl"
+
+# output file for the final aggregation results
+AGG_ANALYSE_FILE = "./data/analyse/%s.csv"
+
+# stop-word directory
+STOP_WORD_DIR = "./data/stopwords"
+
+# stop-word cache
+STOP_WORD_CACHE = "./data/cache/stop_word.pkl"
+
+# special characters that need escaping in regular expressions
+RE_SPECIAL_SIMBOL = [".", "?", "^", "$", "*", "+", "\\", "[", "]", "|", "{", "}", "(", ")"]
+
+# progress-hint step, as a fraction of the total
+PRECENT_TIPS = 0.01

+ 97 - 0
cut.py

@@ -0,0 +1,97 @@
+# -*- coding:utf-8 -*-
+
+import config
+import os
+import tools
+import jieba
+import logging
+import logging.config
+
+from stop_word import load_stop_word
+
+TITLE = "分词处理"
+
+# 待处理的数据文件
+DATA_FILE = "E:\Download\怎么长尾词_1655561719.csv"
+
+def cut_word_and_statistics(data):
+
+    """
+    Segment words and count their frequencies
+    """
+
+    logging.info("starting segmentation and word-frequency statistics")
+
+    # container for the segmentation results
+    key_dict = {}
+    # stop words
+    stop_word = load_stop_word()
+    # total number of rows to process
+    total_num = len(data)
+
+    logging.info("%d rows to process" % total_num)
+
+    for i, item in enumerate(data):
+        # only the first column is needed
+        longTailKey = item.split(",")[0]
+        # strip the newline
+        longTailKey = longTailKey.replace("\n", "")
+        # segment
+        cutWord = jieba.cut_for_search(longTailKey)
+
+        # count
+        for word in cutWord:
+
+            # filter stop words
+            if word in stop_word:
+                continue
+
+            if word in key_dict:
+                key_dict[word] = key_dict[word] + 1
+            else:
+                key_dict[word] = 1
+        
+        # progress hint
+        tools.tip(total_num, i)
+        
+
+    # sort by word frequency, descending
+    logging.info("sorting by word frequency, descending")
+    sorted_key_list = sorted(key_dict.items(), key=lambda x: x[1], reverse=True)
+
+    logging.info("segmentation and word-frequency statistics finished")
+
+    return sorted_key_list
+
+def main():
+
+    # initialize logging
+    tools.init_log()
+
+    tools.log_start_msg(TITLE)
+
+    if not os.path.exists(DATA_FILE):
+        logging.warning("data file to process does not exist: %s" % DATA_FILE)
+        return
+
+    # read the data
+    logging.info("reading the data file: %s" % DATA_FILE)
+    lines = None
+    with open(DATA_FILE, "r", encoding=config.ENCODING_CHARSET) as f:
+        lines = f.readlines()
+    
+    # run segmentation and word-frequency statistics (skip the first two lines)
+    word_root_list = cut_word_and_statistics(lines[2:])
+
+    # export the results
+    logging.info("exporting the segmentation data to: %s" % config.CUT_FILE)
+    with open(config.CUT_FILE, "w", encoding=config.ENCODING_CHARSET) as f:
+        for key, count in word_root_list:
+            f.write("%s,%d\n" % (key, count))
+
+    tools.log_end_msg(TITLE)
+
+
+if __name__ == '__main__':
+    main()
+
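The manual dict counting in `cut_word_and_statistics` could equally be written with `collections.Counter`; a sketch of the equivalent core loop (an alternative formulation, not what the commit does):

```python
from collections import Counter

import jieba

def count_roots(keywords, stop_words):
    """Count root frequencies across keywords, skipping stop words."""
    counter = Counter()
    for kw in keywords:
        counter.update(w for w in jieba.cut_for_search(kw) if w not in stop_words)
    # same shape as cut_word_and_statistics' output: (word, count) pairs, count descending
    return counter.most_common()
```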

+ 0 - 228
cut_multiprocess.py

@@ -1,228 +0,0 @@
-# -*- coding:utf-8 -*-
-
-import datetime
-import os
-import math
-import pickle
-from time import sleep
-import jieba
-from multiprocessing import Process, Manager
-
-from zmq import QUEUE
-
-# TODO
-# 1. look into multi-process jieba segmentation (the built-in parallel mode does not work on Windows)
-# 2. further reduce how often checkpoints are saved (longer save interval, or keep everything in memory)
-# 3. save the result once segmentation finishes, to avoid losing it
-# 4. save each process's segmentation result separately, then merge them
-
-
-
-# path of the data file to process
-DATA_FILE = './data/合并结果.txt'
-
-# segmentation output
-CUT_OUTPUT_FILE = './data/分词结果.txt'
-
-# whether to save the result after segmentation finishes
-IS_ASSUME_TOTAL = True
-
-# whether to checkpoint for resume
-IS_ASSUME = False
-
-# whether test mode is on
-IS_TEST_MODE = False
-
-# amount of data used in test mode
-TEST_DATA_NUM = 100 * 10000
-
-# checkpoint save interval in test mode
-TEST_SAVE_INTERNAL = 200
-
-# encoding
-ENCODING_CHARSET = "UTF-8"
-
-# config file path
-CONFIG_PATH = "./data/pkl/cut_config_%d.pkl"
-
-# number of worker processes
-PROCESS_NUM = os.cpu_count()
-
-# save interval (how often to checkpoint)
-SAVE_INTERNAL = TEST_SAVE_INTERNAL if IS_TEST_MODE else 1000000
-
-# progress reminder interval
-PROCESS_TIPS_INTERNAL = 10 * 10000
-
-def save_config(config_path, config_obj):
-    """
-    Save the config file
-    """
-    with open(config_path, "wb") as f:
-        pickle.dump(config_obj, f)
-
-
-def load_config(config_path):
-    """
-    Load the config file
-    """
-    with open(config_path, "rb") as f:
-        return pickle.load(f)
-
-def cut_word(word):
-    """
-    Segment a word
-    """
-    word_root = jieba.cut_for_search(word)
-    return list(word_root)
-
-def multiprocess_cut_word(process_name, data_list, result_dict, config_path, cut_config):
-
-    """
-    Multi-process segmentation
-    """
-
-    print('process %s -> segmentation started' % process_name)
-
-    if (IS_ASSUME_TOTAL or IS_ASSUME) and os.path.exists(config_path) :
-        cut_config = load_config(config_path)
-        print("process %s -> resumed from checkpoint, state: %s, start position: %d" % (process_name, cut_config["state"], cut_config["current_pos"]))
-
-    if cut_config['state'] == 'run':
-
-        # fetch the data to segment
-        lines = data_list[cut_config['current_pos']:cut_config['end_pos']]
-
-        # count how much data needs processing
-        total_num = len(lines)
-        print("process %s -> remaining to process: %d" % (process_name, total_num))
-
-        for i, line in enumerate(lines):
-            # clean the data
-            line = line.replace("\n", "")
-            # segment
-            cut_config["word_dict"][line]=cut_word(line)
-
-            # checkpoint save
-            if IS_ASSUME and i > 0 and i % SAVE_INTERNAL == 0:
-                cut_config["current_pos"] = cut_config["current_pos"] + SAVE_INTERNAL
-                print("process %s -> checkpoint saved, state: %s, current position: %d" % (process_name, cut_config["state"], cut_config["current_pos"]))
-                save_config(config_path, cut_config)
-            
-            # progress hint
-            if i > 0 and i % PROCESS_TIPS_INTERNAL == 0:
-                print("process %s -> progress: %d / %d" % (process_name, i, total_num))
-
-        # save the final result
-        if IS_ASSUME_TOTAL or IS_ASSUME:
-            print("process %s -> saving the final segmentation result" % process_name)
-            cut_config["state"] = "end"
-            cut_config["current_pos"] = cut_config['end_pos']
-            save_config(config_path, cut_config)
-        
-        # result_dict.update(cut_config["word_dict"])
-        result_dict[process_name]=cut_config["word_dict"]
-
-        print('process %s -> segmentation finished' % process_name)
-    else :
-        # result_dict.update(cut_config['word_dict'])
-        result_dict[process_name]=cut_config["word_dict"]
-        print('process %s -> resumed from checkpoint, segmentation finished' % process_name)
-
-def main():
-
-    print("start time:", datetime.datetime.now())
-
-    print("config: checkpoint resume enabled, save interval: %d" % SAVE_INTERNAL if IS_ASSUME else "config: checkpoint resume disabled")
-    print("config: saving the final segmentation result" if IS_ASSUME_TOTAL else "config: not saving the final segmentation result")
-
-    # worker process container
-    process_list = []
-    # config container
-    config_list = []
-
-    # shared state across processes
-    manager = Manager()
-    # data source shared across processes
-    global_list = manager.list()
-    # results returned by the processes
-    result_dict = manager.dict()
-
-    print("loading data")
-    with open(DATA_FILE, "r", encoding="UTF-8") as f:
-        if IS_TEST_MODE:
-            print("test mode is on, test data size: %d" % TEST_DATA_NUM)
-            global_list.extend(f.readlines()[:TEST_DATA_NUM])
-        else:
-            global_list.extend(f.readlines())
-    
-    total_len = len(global_list)
-    count = math.ceil(total_len / PROCESS_NUM)
-    print("total to process: %d, chunk size: %d" % (total_len, count))
-
-    # build the configs
-    for i in range(PROCESS_NUM):
-        start_pos = i * count
-        end_pos = i * count + count
-        if end_pos >= total_len :
-            end_pos = -1
-        cut_config = {
-            "state": "run",
-            "start_pos": start_pos,
-            "current_pos": start_pos,
-            "end_pos": end_pos,
-            "word_dict": {}
-        }
-        config_list.append(cut_config)
-
-    print("configs", config_list)
-
-    for i, config in enumerate(config_list):
-        p = Process(target=multiprocess_cut_word, args=("process-%d" % i, global_list, result_dict, CONFIG_PATH % i, config))
-        p.start()
-        process_list.append(p)
-
-    for p in process_list:
-        p.join()
-
-    print("merging the final segmentation results: start")
-
-    result = []
-    print("converting to a list for writing to file")
-    for (process_name, word_dict) in result_dict.items():
-        tmp = None
-        for (key, value) in word_dict.items():
-            tmp = ["%s,%s\n" % (key, value) for (key, value) in word_dict.items() ]
-        result.extend(tmp)
-    print("writing to file")
-    with open(CUT_OUTPUT_FILE, "w", encoding=ENCODING_CHARSET) as f:
-        f.writelines(result)
-    # with open(CUT_OUTPUT_FILE, "w", encoding=ENCODING_CHARSET) as f:
-    #     for (process_name, word_dict) in result_dict.items():
-    #         for (key, value) in word_dict.items():
-    #             f.write("%s,%s\n" % (key, value))
-    #             # f.write("\n")
-    print("merging the final segmentation results: end")
-
-    print("end time:", datetime.datetime.now())
-
-def main2():
-    print("start time:", datetime.datetime.now())
-
-    with open(CUT_OUTPUT_FILE, "a", encoding=ENCODING_CHARSET) as f:
-        for i in range(4):
-            config_p = CONFIG_PATH % i
-            print("time: %s, reading %s -- start" % (datetime.datetime.now(), config_p))
-            config = load_config(config_p)
-            print("time: %s, reading %s -- end" % (datetime.datetime.now(), config_p))
-
-            print("time: %s, writing to file -- start" % datetime.datetime.now())
-            for (key, value) in config["word_dict"].items():
-                f.write("%s,%s\n" % (key, value))
-            print("time: %s, writing to file -- end" % datetime.datetime.now())
-
-    print("end time:", datetime.datetime.now())
-
-
-if __name__ == '__main__':
-    main2()

+ 0 - 176
cut_multiprocess2.py

@@ -1,176 +0,0 @@
-# -*- coding:utf-8 -*-
-
-import datetime
-import os
-import math
-import jieba
-from multiprocessing import Process, Manager
-
-
-# path of the data file to process
-DATA_FILE = './data/合并结果.txt'
-
-# segmentation output
-CUT_OUTPUT_FILE = './data/分词结果.txt'
-
-# number of consumer processes
-CONSUMER_NUM = 1
-
-# number of producer processes
-# PRODUCER_NUM = os.cpu_count() - CONSUMER_NUM
-PRODUCER_NUM = 1
-
-# whether test mode is on
-IS_TEST_MODE = False
-
-# amount of data used in test mode
-TEST_DATA_NUM = 100 * 10000
-
-# encoding
-ENCODING_CHARSET = "UTF-8"
-
-# interval for sending to the message queue
-SEND_INTERNAL = 1 * 10000
-
-# progress reminder interval
-PROCESS_TIPS_INTERNAL = 10 * 10000
-
-
-def cut_word(word):
-    """
-    Segment a word
-    """
-    word_root = jieba.cut_for_search(word)
-    return list(word_root)
-
-def consumer(queue):
-    """
-    Consumer: save the data to the target location
-    """
-
-    print("consumer: started")
-
-    with open(CUT_OUTPUT_FILE, "w", encoding=ENCODING_CHARSET) as f:
-        
-        while True:
-
-            msg = queue.get()
-
-            if "quit" == msg.get("command"):
-                print("consumer: received the quit command")
-                break
-
-            if len(msg['payload']) > 0:
-                for item in msg['payload']:
-                    f.write("%s,%s\n" % (item['key'], item['value']))
-    
-    print("consumer: finished")
-
-def producer(data, queue, config):
-    """
-    Multi-process segmentation
-    """
-
-    process_name = config['process_name']
-
-    print('process %s -> segmentation started' % process_name)
-
-    # fetch the data to segment
-    lines = data[config['current_pos']:config['end_pos']]
-
-    # count how much data needs processing
-    total_num = len(lines)
-    print("process %s -> remaining to process: %d" % (process_name, total_num))
-
-    msg_content = {
-        'payload': []
-    }
-
-    for i, line in enumerate(lines):
-        # clean the data
-        line = line.replace("\n", "")
-        # segment
-        word_root = cut_word(line)
-        # 
-        msg_content['payload'].append({"key": line, "value": word_root})
-        
-        if len(msg_content) >= SEND_INTERNAL:
-            queue.put(msg_content)
-            msg_content = {
-                'payload': []
-            }
-        # progress hint
-        if i > 0 and i % PROCESS_TIPS_INTERNAL == 0:
-            print("process %s -> progress: %d / %d" % (process_name, i, total_num))
-    
-    queue.put(msg_content)
-
-    print('process %s -> segmentation finished' % process_name)
-
-def main():
-
-    print("start time:", datetime.datetime.now())
-
-    # worker process container
-    process_list = []
-    # config container
-    config_list = []
-
-    # shared state across processes
-    manager = Manager()
-    # data source shared across processes
-    global_list = manager.list()
-    # inter-process communication queue
-    global_queue = manager.Queue()
-
-    print("loading data")
-    with open(DATA_FILE, "r", encoding="UTF-8") as f:
-        if IS_TEST_MODE:
-            print("test mode is on, test data size: %d" % TEST_DATA_NUM)
-            global_list.extend(f.readlines()[:TEST_DATA_NUM])
-        else:
-            global_list.extend(f.readlines())
-    
-    total_len = len(global_list)
-    count = math.ceil(total_len / PRODUCER_NUM)
-    print("total to process: %d, chunk size: %d" % (total_len, count))
-
-    # build the configs
-    for i in range(PRODUCER_NUM):
-        start_pos = i * count
-        end_pos = i * count + count
-        if end_pos >= total_len :
-            end_pos = None
-        cut_config = {
-            "start_pos": start_pos,
-            "current_pos": start_pos,
-            "end_pos": end_pos,
-            "process_name": "worker-%d" % i
-        }
-        config_list.append(cut_config)
-
-    print("configs", config_list)
-
-    # start the consumer
-    cosumer = Process(target=consumer, args=(global_queue,))
-    cosumer.start()
-
-    # start the producers
-    for i, config in enumerate(config_list):
-        p = Process(target=producer, args=(global_list, global_queue, config))
-        p.start()
-        process_list.append(p)
-
-    for p in process_list:
-        p.join()
-    
-    # send the quit command to the consumer
-    global_queue.put({"command":"quit"})
-    # wait for the consumer to finish
-    cosumer.join()
-
-    print("end time:", datetime.datetime.now())
-
-
-if __name__ == '__main__':
-    main()

+ 0 - 147
cut_statistics.py

@@ -1,147 +0,0 @@
-# -*- coding:utf-8 -*-
-
-import os
-import jieba
-
-# data file to process
-DATA_KEYWORD_FILE = "E:\Download\怎么长尾词_1655561719.csv"
-
-# output file
-CUT_OUTPUT_FILE = "./data/分词与词频统计结果.csv"
-
-# file encoding
-ENCODING_CHARSET = "UTF-8"
-
-# stop words
-STOP_WORD_DIR = "./data/stopwords"
-
-# progress hint interval
-INTERNAL_NUM = 50000
-
-def cut_word_and_statistics(data):
-
-    """
-    Segment words and count their frequencies
-    """
-
-    print("starting segmentation and word-frequency statistics")
-
-    total_num = len(data)
-    print("rows to process: %d" % total_num)
-
-    # container for the segmentation results
-    key_dict = {}
-
-    # skip the first two lines
-    for i, item in enumerate(data):
-        # only the first column is needed
-        longTailKey = item.split(",")[0]
-
-        longTailKey = longTailKey.replace("\n", "")
-        
-        # segment
-        cutWord = jieba.cut_for_search(longTailKey)
-
-        # count
-        for word in cutWord:
-            if word in key_dict:
-                key_dict[word] = key_dict[word] + 1
-            else:
-                key_dict[word] = 1
-        
-        # progress hint
-        if i % INTERNAL_NUM == 0:
-            print("segmentation progress %d / %d" % (i, total_num))
-
-    print("sorting by word frequency, descending")
-
-    # sort by word frequency, descending
-    sorted_key_dict = sorted(key_dict.items(), key=lambda x: x[1], reverse=True)
-
-    print("segmentation finished")
-
-    return sorted_key_dict
-
-
-def load_stop_words():
-    """
-    Load the stop-word list
-    """
-    
-    print("loading stop words - start")
-
-    # stop-word container
-    stop_word = []
-
-    stop_word_files = os.listdir(STOP_WORD_DIR)
-
-    for file in stop_word_files:
-        stop_word_file = os.path.join(STOP_WORD_DIR, file)
-        with open(stop_word_file, encoding=ENCODING_CHARSET) as f:
-            for item in f:
-                stop_word.append(item.replace("\n",""))
-
-    print("stop-word count before dedup:", len(stop_word))
-
-    stop_word = list(set(stop_word))
-
-    print("stop-word count after dedup:", len(stop_word))
-
-    print("loading stop words - end")
-
-    return stop_word
-
-
-def filter_stop_word(word_root: dict) :
-    """
-    Filter stop words out of the segmentation result
-    """
-
-    print("filtering stop words - start")
-
-    # load the stop words
-    stop_word = load_stop_words()
-    
-    print("root count before filtering: %d" % len(word_root))
-
-    # filter the stop words
-    word_root_filter = dict((key, value) for key , value in word_root if key not in stop_word)
-
-    print("root count after filtering: %d" % len(word_root_filter))
-
-    print("filtering stop words - end")
-
-    return word_root_filter
-
-def main():
-
-    print("start")
-
-    if not os.path.exists(DATA_KEYWORD_FILE):
-        raise Exception("data file to process does not exist: %s" % DATA_KEYWORD_FILE)
-
-    # read the data
-    print("reading the data file")
-    lines = None
-    with open(DATA_KEYWORD_FILE, "r", encoding=ENCODING_CHARSET) as f:
-        lines = f.readlines()
-    
-    # run segmentation and word-frequency statistics
-    word_root = cut_word_and_statistics(lines[:100])
-
-    # filter out stop words
-    word_root_filter = filter_stop_word(word_root)
-        
-    # export the filtered data, without header or row index
-    # print("exporting the filtered result")
-    # word_root_dataframe.to_csv(CUT_OUTPUT_FILE, header=False, index=False)
-    with open(CUT_OUTPUT_FILE, "w", encoding=ENCODING_CHARSET) as f:
-        for item in word_root_filter.items():
-            f.write("%s,%d\n" % item)
-
-    print("end")
-
-
-if __name__ == '__main__':
-    main()
-

+ 0 - 185
data/category/白凉粉是什么东西.txt

@@ -1,185 +0,0 @@
-白凉粉是什么东西
-
-铁粉是什么东西
-白带是什么东西
-粉粉是什么东西
-什么东西是白的
-白膜是什么东西
-黄白是什么东西
-果粉是什么东西
-粉刺是什么东西
-白莲是什么东西
-血粉是什么东西
-白银是什么东西
-蛋白是什么东西
-白土是什么东西
-臭粉是什么东西
-鸡粉是什么东西
-淀粉是什么东西
-蛋白粉是什么东西
-钛白粉什么东西
-粉虫是什么东西
-冰粉是什么东西
-生粉是什么东西
-瑞粉是什么东西
-粉尘是什么东西
-太粉是什么东西
-食粉是什么东西
-海粉是什么东西
-白胶是什么东西
-太白粉是什么东西
-白术是什么东西
-白粮是什么东西
-白果是什么东西
-粉贝是什么东西
-白子是什么东西
-肠粉是什么东西
-葱白是什么东西
-白洞是什么东西
-藕粉是什么东西
-茭白是什么东西
-干粉是什么东西
-糖粉是什么东西
-水粉是什么东西
-鱼白是什么东西
-磷粉是什么东西
-口白是什么东西
-菊粉是什么东西
-鞋粉是什么东西
-澄粉是什么东西
-白柑是什么东西
-白矾是什么东西
-牙粉是什么东西
-粉葛是什么东西
-什么东西是白色
-粉刺白色东西是什么
-黑粉是什么东西
-面粉是什么东西
-太白粉水是什么东西
-粉嫩是什么东西
-白薯是什么东西
-石粉是什么东西
-凉薯是什么东西
-肽粉是什么东西
-粉瘤是什么东西
-粉扑是什么东西
-麻粉是什么东西
-白芷是什么东西
-白练是什么东西
-凉粉是什么东西做的
-醋白是什么东西
-矿粉是什么东西
-钼粉是什么东西
-凉是什么东西
-粉末是什么东西
-结粉是什么东西
-白酱是什么东西
-白浆是什么东西
-烟粉是什么东西
-薤白是什么东西
-白凉粉是什么东西做的
-什么东西是米白
-钛白粉是什么东西
-小粉是什么东西
-粉沫是什么东西
-墨粉是什么东西
-什么东西是粉
-白条是什么东西
-白虎是什么东西
-凉粉是什么东西
-花粉是什么东西
-海白是什么东西
-凉霸是什么东西
-什么东西是凉的
-骨粉是什么东西
-白点是什么东西
-粉云是什么东西
-什么东西是黑白
-草粉是什么东西
-药粉是什么东西
-什么东西是凉性
-白水是什么东西
-金粉是什么东西
-白斑是什么东西
-云粉是什么东西
-发粉是什么东西
-菌粉是什么东西
-炸粉是什么东西
-白沫是什么东西
-根粉是什么东西
-蜜粉是什么东西
-丁粉是什么东西
-粉底是什么东西
-粉姜是什么东西
-白宝是什么东西
-粉画是什么东西
-白油是什么东西
-奥粉是什么东西
-墙粉是什么东西
-白羊是什么东西
-牛粉是什么东西
-白云是什么东西
-羊粉是什么东西
-白盐是什么东西
-散粉是什么东西
-鱼粉是什么东西
-上白面粉是什么东西
-白灰是什么东西
-铜粉是什么东西
-白蜡是什么东西
-粉饼是什么东西
-白碱是什么东西
-饼粉是什么东西
-葛粉是什么东西
-铅粉是什么东西
-羊白是什么东西
-铑粉是什么东西
-该粉是什么东西
-白泥是什么东西
-什么东西是白心
-蛋白粉是些什么东西
-什么东西是凉肠
-白霍是什么东西
-白榉是什么东西
-白菌是什么东西
-干的白凉粉是什么东西
-凉粉熬的是什么东西
-隥粉是什么东西
-粉醇是什么东西
-凉粉是什么东西做了
-老凉粉是什么东西
-粉欧是什么东西
-食用的白凉粉是什么东西
-直粉是什么东西
-白望是什么东西
-白凉粉是什么东西制做的
-白氿是什么东西
-奶白是什么东西
-肝蛋白粉是什么东西
-花生凉粉是什么东西
-奴粉是什么东西
-白宇是什么东西
-白粉学校是什么东西
-粉贴是什么东西
-5白是什么东西
-凉粉浆是什么东西
-铲粉是什么东西
-白凉粉是什么东西怎么用
-白腿是什么东西
-梨白粉病是什么东西
-白塘是什么东西
-请问白凉粉是什么东西做的
-罩粉是什么东西
-白醋是什么东西
-炸东西粉是什么
-米粉是什么东西
-芯粉是什么东西
-白糠是什么东西
-s粉是什么东西
-白色粉刺是什么东西
-果冻白凉粉是什么东西啊
-吸的白粉是什么东西
-鸟粉是什么东西
-虎粉是什么东西
-白色粉圆是什么东西
-肩粉是什么东西

+ 0 - 44
data/category/腋下长了一个小疙瘩是什么东西.txt

@@ -1,44 +0,0 @@
-腋下长了一个小疙瘩是什么东西
-
-下巴长了一个东西是什么
-腋下长个东西是什么
-腋下长了一个小肉芽什么东西
-腋下长了个大疙瘩很疼是什么东西
-食指下面长了一个疙瘩是什么东西
-腋下长了小疙瘩是什么东西不疼
-下嘴唇长了个疙瘩是什么东西
-腋下长一小疙瘩很痛是什么东西
-腋下长了个坨是什么东西
-腋下痒长了一个疙瘩是什么东西
-脖子长了一个小疙瘩是什么东西
-腋下长了一个小肉球什么东西
-腋下长了个小软块是什么东西
-脚上长了个小疙瘩是什么东西
-腋下长几个小疙瘩是什么东西
-手臂长了一个小疙瘩是什么东西
-腋下长小红疙瘩是什么东西
-舌头长了个小疙瘩是什么东西
-腋下长了一个小长条是什么东西
-肋骨下面长了一个疙瘩是什么东西
-腋下长了一个圆疙瘩是什么东西
-脚根部长了一个小疙瘩是什么东西
-下巴长了个疙瘩是什么东西
-腋窝前长了一个小疙瘩是什么东西
-腋下偶尔长一个小疙瘩是什么东西
-副乳长了一个小疙瘩是什么东西
-腋下长了一条小疙瘩是什么东西
-腋下长了一个小硬包是什么东西
-腋下皮下长了小疙瘩是什么东西
-腋下长了一个小黑疙瘩是什么东西
-腋窝里面长了个小疙瘩是什么东西
-腋下长了几个小疙瘩什么东西
-腋下长了一团小疙瘩是什么东西
-胳肢窝长了一个小疙瘩是什么东西
-胸上长了一个小疙瘩是什么东西
-腋下长了一片红小疙瘩是什么东西
-腋下长了几个硬疙瘩是什么东西
-腋下长了几个个小疙瘩是什么东西
-下面长了个小疙瘩是什么东西
-腋下长了一些小红疙瘩是什么东西
-腋下长了一串小疙瘩是什么东西
-腋下长了个小疙瘩是什么原因

+ 52 - 0
key.py

@@ -0,0 +1,52 @@
+# -*- coding:utf-8 -*-
+
+import logging
+import config
+import tools
+import jieba
+import datetime
+import mmap
+
+TITLE = "关键词表 生成"
+
+
+def main():
+    
+    # initialize logging
+    tools.init_log()
+    tools.log_start_msg(TITLE)
+
+    with open(config.MERGE_FILE, "r", encoding=config.ENCODING_CHARSET) as fmerge, \
+        open(config.KEY_FILE, "w", encoding=config.ENCODING_CHARSET) as fw, \
+        mmap.mmap(fmerge.fileno(), 0, access=mmap.ACCESS_READ) as fmmap:
+
+        # TODO 
+        # there may be room for IO optimization here:
+        # mmap is not strictly needed, plain readline() calls would do,
+        # and the progress hint is incomplete
+
+            count = -1
+            total_num = fmmap.size()
+
+            while True:
+                count = count + 1
+                # read a keyword
+                word = fmmap.readline().decode("UTF-8").replace("\r","").replace("\n","")
+
+                # stop once there is nothing left to read
+                if not word :
+                    break
+                
+                # segment
+                word_root = list(jieba.cut_for_search(word))
+
+                # write the record: index,keyword,roots
+                fw.write("%d,%s,%s\n"%(count,word,word_root))
+
+                # progress hint
+                tools.tip(total_num, fmmap.tell(), False)
+
+    tools.log_end_msg(TITLE)
+
+if __name__ == '__main__':
+    main()

+ 58 - 0
key_index.py

@@ -0,0 +1,58 @@
+# -*- coding:utf-8 -*-
+
+import config
+import tools
+import mmap
+import logging
+
+TITLE = "关键词索引"
+
+def main():
+    # 日志配置初始化
+    tools.init_log()
+    tools.log_start_msg(TITLE)
+
+    # 关键词索引容器
+    key_index_cache = {}
+
+    with open(config.KEY_FILE, "r", encoding=config.ENCODING_CHARSET) as fkey, \
+        mmap.mmap(fkey.fileno(), 0, access=mmap.ACCESS_READ) as fmmap:
+        
+        # 总大小
+        total_num = fmmap.size()
+
+        while True:
+            # 读取光标位置
+            cur_pos = fmmap.tell()
+            # 把光标移动到下一行
+            line = fmmap.readline().decode(config.ENCODING_CHARSET)
+            # 如果没有数据则结束
+            if not line :
+                break
+            
+            # 获取关键词序号
+            index = line.index(",")
+
+            # 建立关键词序号和位置的关系
+            key_index_cache[line[:index]]=cur_pos
+            
+            # 进度显示
+            tools.tip_in_size(total_num, cur_pos)
+        
+        # 保存索引
+        tools.save_obj(config.KEY_INDEX_CACHE, key_index_cache)
+
+    tools.log_end_msg(TITLE)
+    
+
+if __name__ == '__main__':
+    main()
+
+    # key_index_cache = tools.load_obj(config.KEY_INDEX_CACHE)
+
+    # with open(config.KEY_FILE, "r", encoding=config.ENCODING_CHARSET) as fkey, \
+    #     mmap.mmap(fkey.fileno(), 0, access=mmap.ACCESS_READ) as fmmap:
+    #         for key,value in key_index_cache.items():
+    #             fmmap.seek(value)
+    #             line = fmmap.readline().decode(config.ENCODING_CHARSET)
+    #             logging.debug("key: %s, value: %d, contents: %s" % (key, value, line))

+ 91 - 0
key_reverse.py

@@ -0,0 +1,91 @@
+# -*- coding:utf-8 -*-
+
+import sys
+from time import time
+import os
+import config
+import tools
+import ast
+import re
+import stop_word
+
+TITLE = "关键词倒排文件"
+
+
+
+def main():
+    """
+    构建待排表
+    """
+
+    tools.init_log()
+    tools.log_start_msg(TITLE)
+
+    # 提取规则
+    s = r"(\d+),([^,]*),(.*)"
+    pattern = re.compile(s, re.I)
+
+    # 倒排表 容器
+    key_reverse = {}
+
+    # 停用表
+    stop_word_cache = stop_word.load_stop_word()
+
+    with open(config.KEY_FILE, "r", encoding=config.ENCODING_CHARSET) as fkey:
+        
+        # 获取文件总大小,获取后需要复原光标位置
+        fkey.seek(0, os.SEEK_END)
+        total_num = fkey.tell()
+        fkey.seek(0)
+
+        while True:
+            # 获取当前处理位置
+            cur_pos = fkey.tell()
+            
+            # 进度提示
+            tools.tip_in_size(total_num, cur_pos)
+
+            # 读取关键词数据
+            line = fkey.readline()
+
+            # 如果到行尾则结束
+            if not line:
+                break
+
+            # 提取数据
+            m = pattern.match(line)
+            # 获取关键词序号
+            index = m.group(1)
+            # 获取词根
+            key_root = m.group(3)
+            # 转换成真正的list对象
+            for item in ast.literal_eval(key_root):
+                
+                # 排除停用词
+                if item in stop_word_cache:
+                    continue
+
+                # 构建倒排表
+                val = key_reverse.get(item)
+                if val:
+                    key_reverse[item].append(index) 
+                else:
+                    key_reverse[item]=[]
+                    key_reverse[item].append(index)
+
+    # 保存到本地文件
+    with open(config.KEY_REVERSE_FILE, "w", encoding=config.ENCODING_CHARSET) as f:
+        for key, value in key_reverse.items():
+            f.write("%s,%s\n" % (key, value))
+
+    tools.log_end_msg(TITLE)
+
+if __name__ == "__main__":
+    main()
+
+    # timing test for loading
+    # start = time()
+    # key_reverse_cache = tools.load_obj(config.KEY_REVERSE_CACHE)
+    # end = time()
+    # print("size in memory:", sys.getsizeof(key_reverse_cache))
+    # print("load time:", end-start)

+ 57 - 0
key_reverse_index.py

@@ -0,0 +1,57 @@
+# -*- coding:utf-8 -*-
+
+import sys
+from time import time
+import os
+import config
+import tools
+import ast
+import re
+import stop_word
+import mmap
+
+TITLE = "关键词倒排索引"
+
+def main():
+    # 日志配置初始化
+    tools.init_log()
+    tools.log_start_msg(TITLE)
+
+    # 关键词倒排索引容器
+    key_reverse_index_cache = {}
+
+    with open(config.KEY_REVERSE_FILE, "r", encoding=config.ENCODING_CHARSET) as freverse, \
+        mmap.mmap(freverse.fileno(), 0, access=mmap.ACCESS_READ) as fmmap:
+        
+        # 总大小
+        total_num = fmmap.size()
+
+        while True:
+            # 读取光标位置
+            cur_pos = fmmap.tell()
+            # 把光标移动到下一行
+            line = fmmap.readline().decode(config.ENCODING_CHARSET)
+
+            # 如果没有数据则结束
+            if not line :
+                break
+            
+            # 获取词根位置,建立词根和位置的关系
+            index = line.index(",")
+            key_reverse_index_cache[line[:index]]=cur_pos
+            
+            # 进度显示
+            tools.tip_in_size(total_num, cur_pos)
+        
+        # 保存索引
+        tools.save_obj(config.KEY_REVERSE_INDEX_CACHE, key_reverse_index_cache)
+
+    tools.log_end_msg(TITLE)
+
+if __name__ == "__main__":
+    # main()
+    key_reverse_index_cache = tools.load_obj(config.KEY_REVERSE_INDEX_CACHE)
+    for i, item in enumerate(key_reverse_index_cache):
+        if i > 10:
+            break
+        print(item)

+ 28 - 0
logging.conf

@@ -0,0 +1,28 @@
+[loggers]
+keys=root
+
+[handlers]
+keys=fileHandler,consoleHandler
+
+[formatters]
+keys=simpleFormatter
+
+[logger_root]
+level=INFO
+handlers=fileHandler,consoleHandler
+
+[handler_consoleHandler]
+class=StreamHandler
+args=(sys.stdout,)
+level=DEBUG
+formatter=simpleFormatter
+
+[handler_fileHandler]
+class=FileHandler
+args=('../all.log', 'a', "UTF-8")
+level=DEBUG
+formatter=simpleFormatter
+
+[formatter_simpleFormatter]
+format=%(asctime)s - %(name)s - %(levelname)s - %(message)s
+datefmt=

+ 18 - 15
merge_cut_word.py → merge.py

@@ -1,20 +1,19 @@
 # -*- coding: utf-8 -*-
 
+import config
 import os
+import tools
+import logging
 import zipfile
 
+TITLE= "拓展词合并"
+
 # 带合并的文件目录
 DATA_DIR = "E:\Download\长尾关键词\普通-p"
 
-# 合并后输出文件
-MERGE_OUTPUT_FILE = "./data/合并结果.txt"
-
-# 排除合并的文件
-EXCLUDE_FILES = ['打开乱码如何处理?.txt']
-
 def get_files(path):
     '''
-    读取文件夹下的文件名称
+    读取文件夹下的文件列表
     '''
     file_list = []
     for file in os.listdir(path):
@@ -38,17 +37,19 @@ def merge_file_content():
     ----------
     """
 
-    print("----------- 开始 -----------")
+    # 日志初始化
+    tools.init_log()
+
+    tools.log_start_msg(TITLE)
 
     # 获取文件列表
-    
     files = get_files(DATA_DIR)
 
+    # 总文件数
     total_num = len(files)
+    logging.info("待处理文件数:%d" % total_num)
 
-    print("读取文件列表,待处理文件数:%d" % total_num)
-
-    with open(MERGE_OUTPUT_FILE, "w", encoding="utf-8") as f:
+    with open(config.MERGE_FILE, "w", encoding="utf-8") as f:
 
         for i, file in enumerate(files):
             zfile = zipfile.ZipFile(file)
@@ -59,10 +60,10 @@ def merge_file_content():
                 realname = filename.encode('cp437').decode('gbk')
                 
                # skip invalid files
-                if realname in EXCLUDE_FILES:
+                if realname in config.MERGE_EXCLUDE_FILES:
                     continue
 
-                print("processing file: %s, progress: %d / %d" % (realname, i, total_num))
+                logging.info("processing file: %s" % realname)
 
                 # read the file inside the archive
                 with zfile.open(filename) as file_content:
@@ -73,8 +74,10 @@ def merge_file_content():
                        # only the first column is needed
                         f.write(split[0])
                         f.write("\n")
+                
+            tools.tip(total_num, i)
 
-    print("----------- 结束 -----------")
+    tools.log_end_msg(TITLE)
 
 if __name__ == '__main__':
     merge_file_content()
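A detail worth calling out in the loop above: `filename.encode('cp437').decode('gbk')` undoes the cp437 decoding that Python's zipfile applies to entry names that are not flagged as UTF-8, recovering the original GBK file name. A standalone sketch (the archive name is an assumption):

```python
import zipfile

# hypothetical archive created by a Windows tool that stored GBK entry names
with zipfile.ZipFile("keywords.zip") as zf:
    for name in zf.namelist():
        # zipfile decoded the raw name bytes as cp437; re-encode, then decode as GBK
        real_name = name.encode("cp437").decode("gbk")
        print(real_name)
```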

+ 65 - 0
stop_word.py

@@ -0,0 +1,65 @@
+# -*- coding:utf-8 -*-
+
+from datetime import datetime
+import os
+import time
+import tools
+import config
+import pickle
+import logging
+
+TITLE = "停用词"
+
+def load_stop_word():
+    """
+    加载停用词
+    """
+
+    # 判断是否存在缓存
+    if os.path.exists(config.STOP_WORD_CACHE):
+        logging.debug("存在停用词缓存")
+        return tools.load_obj(config.STOP_WORD_CACHE)
+
+    logging.debug("正在构建停用词缓存")
+
+    # 停用词容器
+    stop_word = []
+
+    # 构建停用词列表
+    stop_word_files = os.listdir(config.STOP_WORD_DIR)
+    for file in stop_word_files:
+        stop_word_file = os.path.join(config.STOP_WORD_DIR, file)
+        with open(stop_word_file, encoding=config.ENCODING_CHARSET) as f:
+            for item in f:
+                # 移除换行符
+                stop_word.append(item.replace("\n",""))
+    # 去重
+    stop_word = list(set(stop_word))
+
+    # 把list改成dict提升检索速度
+    stop_word_dict = {}
+    for item in stop_word:
+        stop_word_dict[item]=None
+    
+    logging.debug("把停用词缓存保存到本地")
+
+    # 保存本地作为缓存
+    tools.save_obj(config.STOP_WORD_CACHE, stop_word_dict)
+    
+    return stop_word_dict
+
+if __name__ == '__main__':
+
+    tools.init_log()
+    tools.log_start_msg(TITLE)
+
+    stop_word = load_stop_word()
+
+    start = time.time()
+    for i in range(1400*10000):
+        for item in ["总之", "风雨无阻","千"]:
+            item in stop_word
+    end = time.time()
+    print("耗时:", end - start)
+
+    tools.log_end_msg(TITLE)
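The list-to-dict conversion in `load_stop_word` is purely a membership-speed optimization: hash lookups in a dict are O(1) while `in` on a list scans linearly (a set would express the same idea). A tiny illustrative benchmark, with made-up data:

```python
import time

words = [str(i) for i in range(100000)]
as_dict = dict.fromkeys(words)      # same trick as load_stop_word

t0 = time.time()
for _ in range(1000):
    "99999" in words                # O(n) list scan
t1 = time.time()
for _ in range(1000):
    "99999" in as_dict              # O(1) hash lookup
t2 = time.time()
print("list: %.4fs, dict: %.4fs" % (t1 - t0, t2 - t1))
```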

+ 184 - 0
tools.py

@@ -0,0 +1,184 @@
+# -*- coding:utf-8 -*-
+
+import math
+import logging
+import os
+import config
+import logging.config
+import pickle
+import mmap
+
+TITLE = "工具类"
+
+tip_internal_cache = {}
+
+def init_log():
+    """
+    日志初始化工具
+    """
+    # 读取日志配置文件内容
+    logging.config.fileConfig('./logging.conf')
+
+    # 用一个没有在配置文件中定义的logger名称来创建一个日志器logger
+    return logging.getLogger()
+
+def log_start_msg(msg):
+    """
+    执行开始时的简易日志输出
+    """
+    logging.info("-----------------%s 开始-----------------" % msg)
+
+def log_end_msg(msg):
+    """
+    执行结束时的简易日志输出
+    """
+    logging.info("-----------------%s 结束-----------------" % msg)
+
+def get_tip_internal(total_num):
+    """
+    Compute the progress-hint interval
+    """
+    # try the cache first
+    internal = tip_internal_cache.get(total_num)
+    # otherwise compute it and cache it
+    if not internal:
+        internal = math.ceil(total_num * config.PRECENT_TIPS)
+        tip_internal_cache[total_num] = internal
+    return internal
+    
+
+def tip(total_num, cur_num, is_zero_base=True):
+    """
+    Simple progress hint
+
+    total_num: total count
+
+    cur_num: current progress (zero-based)
+
+    internal: hint interval
+    """
+
+    # TODO
+    # switch to a percentage display
+
+    internal = get_tip_internal(total_num)
+
+    # cur_num + 1 corrects for zero-based counting
+    if is_zero_base:
+        cur_num = cur_num + 1
+
+    # progress hint
+    if cur_num == total_num:
+        logging.info("progress %d / %d" % (total_num, total_num))
+    elif cur_num % internal == 0:
+        logging.info("progress %d / %d" % (cur_num, total_num))
+
+def tip_in_size(total_size, cur_pos):
+    """
+    Simple progress hint (for when the total line count is unknown)
+
+    total_size: total size
+
+    cur_pos: current position
+    """
+
+    # try the cache first
+    tip_internal = tip_internal_cache.get(total_size)
+    if not tip_internal:
+        # no cache yet: build the checkpoint and interval info
+        internal = math.ceil(total_size * config.PRECENT_TIPS)
+        tip_internal= {
+            "check_point": cur_pos,
+            "internal": internal
+        }
+        # put it in the cache
+        tip_internal_cache[total_size] = tip_internal
+    
+    # show progress once the current position passes the checkpoint
+    if cur_pos >= tip_internal["check_point"]:
+
+        logging.info("progress %d / %d" % (cur_pos, total_size))
+
+        # advance the checkpoint
+        check_point = tip_internal["check_point"]
+        internal = tip_internal["internal"]
+
+        while cur_pos >= check_point:
+
+            check_point = check_point + internal
+
+            # clamp the checkpoint to the total
+            if check_point > total_size:
+                check_point = total_size
+                # break manually, otherwise this would loop forever
+                break
+        
+        # store the new checkpoint
+        tip_internal["check_point"] = check_point
+
+
+    
+
+def save_obj(path, obj):
+    """
+    Save an object to disk
+    """
+    with open(path, "wb") as f:
+        pickle.dump(obj, f)
+
+def load_obj(path):
+    """
+    Load an object from disk
+    """
+    with open(path, "rb") as f:
+        return pickle.load(f)
+
+if __name__ == "__main__":
+
+    init_log()
+
+    log_start_msg(TITLE)
+
+    # test the plain hint
+    # total = 3
+    # for i in range(total):
+    #     tip(total, i)
+
+    # test the mmap-based hint
+    # with open(config.KEY_FILE, "r", encoding=config.ENCODING_CHARSET) as fkey, \
+    #     mmap.mmap(fkey.fileno(), 0, access=mmap.ACCESS_READ) as fmmap:
+        
+    #     # total size
+    #     total_num = fmmap.size()
+
+    #     while True:
+    #         # current cursor position
+    #         cur_pos = fmmap.tell()
+    #         # advance the cursor to the next line
+    #         line = fmmap.readline()
+
+    #         # progress hint
+    #         tip_in_size(total_num, cur_pos)
+
+    #         if not line:
+    #             break
+
+    # test the line-by-line reading hint
+    with open(config.KEY_FILE, "r", encoding=config.ENCODING_CHARSET) as fkey:
+
+        fkey.seek(0, os.SEEK_END)
+        total_num = fkey.tell()
+        fkey.seek(0)
+
+        while True:
+
+            cur_pos = fkey.tell()
+
+            line = fkey.readline()
+
+            tip_in_size(total_num, cur_pos)
+
+            if not line:
+                break
+    
+    log_end_msg(TITLE)

+ 0 - 363
长尾关键词分析.ipynb

@@ -1,363 +0,0 @@
-{
- "cells": [
-  {
-   "cell_type": "code",
-   "execution_count": 1,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "import pandas as pd\n",
-    "import os\n",
-    "\n",
-    "ORIG_FILE = \"./data/范用性关键词-分词结果.csv\"\n",
-    "DEST_FILE = \"./data/范用性关键词-分词结果-过滤停用词.csv\"\n",
-    "DEST_FILE_FILTER = \"./data/范用性关键词-分词结果-过滤停用词-词频大于300.csv\"\n",
-    "STOP_WORD_DIR = \"./data/stopwords\""
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 2,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "df = pd.read_csv(ORIG_FILE, names=['key','count'])"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 3,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/html": [
-       "<div>\n",
-       "<style scoped>\n",
-       "    .dataframe tbody tr th:only-of-type {\n",
-       "        vertical-align: middle;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe tbody tr th {\n",
-       "        vertical-align: top;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe thead th {\n",
-       "        text-align: right;\n",
-       "    }\n",
-       "</style>\n",
-       "<table border=\"1\" class=\"dataframe\">\n",
-       "  <thead>\n",
-       "    <tr style=\"text-align: right;\">\n",
-       "      <th></th>\n",
-       "      <th>key</th>\n",
-       "      <th>count</th>\n",
-       "    </tr>\n",
-       "  </thead>\n",
-       "  <tbody>\n",
-       "    <tr>\n",
-       "      <th>0</th>\n",
-       "      <td>怎么</td>\n",
-       "      <td>1051516</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>1</th>\n",
-       "      <td>的</td>\n",
-       "      <td>123009</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>2</th>\n",
-       "      <td>怎么办</td>\n",
-       "      <td>93937</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>3</th>\n",
-       "      <td>怎么样</td>\n",
-       "      <td>91070</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>4</th>\n",
-       "      <td>做</td>\n",
-       "      <td>63034</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>...</th>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>116625</th>\n",
-       "      <td>做文</td>\n",
-       "      <td>1</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>116626</th>\n",
-       "      <td>提微商</td>\n",
-       "      <td>1</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>116627</th>\n",
-       "      <td>仰卧</td>\n",
-       "      <td>1</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>116628</th>\n",
-       "      <td>起坐</td>\n",
-       "      <td>1</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>116629</th>\n",
-       "      <td>仰卧起坐</td>\n",
-       "      <td>1</td>\n",
-       "    </tr>\n",
-       "  </tbody>\n",
-       "</table>\n",
-       "<p>116630 rows × 2 columns</p>\n",
-       "</div>"
-      ],
-      "text/plain": [
-       "         key    count\n",
-       "0         怎么  1051516\n",
-       "1          的   123009\n",
-       "2        怎么办    93937\n",
-       "3        怎么样    91070\n",
-       "4          做    63034\n",
-       "...      ...      ...\n",
-       "116625    做文        1\n",
-       "116626   提微商        1\n",
-       "116627    仰卧        1\n",
-       "116628    起坐        1\n",
-       "116629  仰卧起坐        1\n",
-       "\n",
-       "[116630 rows x 2 columns]"
-      ]
-     },
-     "execution_count": 3,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "df"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 4,
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "去重前,停用词数量: 5778\n",
-      "去重后,停用词数量: 2462\n"
-     ]
-    }
-   ],
-   "source": [
-    "# 导入停用词\n",
-    "\n",
-    "stop_word = [];\n",
-    "\n",
-    "stop_word_files = os.listdir(STOP_WORD_DIR)\n",
-    "for file in stop_word_files:\n",
-    "    stop_word_file = os.path.join(STOP_WORD_DIR, file)\n",
-    "    with open(stop_word_file) as f:\n",
-    "        for item in f:\n",
-    "            stop_word.append(item.replace(\"\\n\",\"\"))\n",
-    "print(\"去重前,停用词数量:\", len(stop_word))\n",
-    "stop_word = list(set(stop_word))\n",
-    "print(\"去重后,停用词数量:\", len(stop_word))"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 5,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# 过滤停用词\n",
-    "df = df[df.apply(lambda row : row['key'] not in stop_word, axis=1)]"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 107,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# 导出过滤后的数据,不要表头和行号\n",
-    "df.to_csv(DEST_FILE, header=False, index=False)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 6,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/html": [
-       "<div>\n",
-       "<style scoped>\n",
-       "    .dataframe tbody tr th:only-of-type {\n",
-       "        vertical-align: middle;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe tbody tr th {\n",
-       "        vertical-align: top;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe thead th {\n",
-       "        text-align: right;\n",
-       "    }\n",
-       "</style>\n",
-       "<table border=\"1\" class=\"dataframe\">\n",
-       "  <thead>\n",
-       "    <tr style=\"text-align: right;\">\n",
-       "      <th></th>\n",
-       "      <th>count</th>\n",
-       "    </tr>\n",
-       "  </thead>\n",
-       "  <tbody>\n",
-       "    <tr>\n",
-       "      <th>count</th>\n",
-       "      <td>115534.000000</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>mean</th>\n",
-       "      <td>27.613802</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>std</th>\n",
-       "      <td>311.900416</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>min</th>\n",
-       "      <td>1.000000</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>25%</th>\n",
-       "      <td>1.000000</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>50%</th>\n",
-       "      <td>2.000000</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>75%</th>\n",
-       "      <td>6.000000</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>max</th>\n",
-       "      <td>63034.000000</td>\n",
-       "    </tr>\n",
-       "  </tbody>\n",
-       "</table>\n",
-       "</div>"
-      ],
-      "text/plain": [
-       "               count\n",
-       "count  115534.000000\n",
-       "mean       27.613802\n",
-       "std       311.900416\n",
-       "min         1.000000\n",
-       "25%         1.000000\n",
-       "50%         2.000000\n",
-       "75%         6.000000\n",
-       "max     63034.000000"
-      ]
-     },
-     "execution_count": 6,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "df.describe()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 7,
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "大于1000的数量: 418\n",
-      "大于500的数量: 1035\n",
-      "大于400的数量: 1340\n",
-      "大于300的数量: 1883\n",
-      "大于250的数量: 2282\n",
-      "大于100的数量: 5104\n"
-     ]
-    }
-   ],
-   "source": [
-    "print(\"大于1000的数量:\", df[df['count'] > 1000].count().key)\n",
-    "print(\"大于500的数量:\", df[df['count'] > 500].count().key)\n",
-    "print(\"大于400的数量:\", df[df['count'] > 400].count().key)\n",
-    "print(\"大于300的数量:\", df[df['count'] > 300].count().key)\n",
-    "print(\"大于250的数量:\", df[df['count'] > 250].count().key)\n",
-    "print(\"大于100的数量:\", df[df['count'] > 100].count().key)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 9,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# 过滤词频小于300的数据\n",
-    "df=df[df['count']>=300]"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 10,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# 导出过滤后的数据,不要表头和行号,只要关键词列\n",
-    "df.to_csv(DEST_FILE_FILTER, header=False)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 16,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "df[5:7].to_csv(\"./data/多线城-3.csv\", header=False)"
-   ]
-  }
- ],
- "metadata": {
-  "interpreter": {
-   "hash": "679ecc657d123b537eb7946f00483c298ba68f4074c79757b9e8823d90af42fb"
-  },
-  "kernelspec": {
-   "display_name": "Python 3.9.0 ('jieba')",
-   "language": "python",
-   "name": "python3"
-  },
-  "language_info": {
-   "codemirror_mode": {
-    "name": "ipython",
-    "version": 3
-   },
-   "file_extension": ".py",
-   "mimetype": "text/x-python",
-   "name": "python",
-   "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3",
-   "version": "3.9.0"
-  },
-  "orig_nbformat": 4
- },
- "nbformat": 4,
- "nbformat_minor": 2
-}