@@ -5,6 +5,12 @@ import utils
 import jieba
 
 import zipfile
 
+# File suffix: 长尾词.txt (per-source long-tail keyword list)
+FILE_SUFFIX_LONG_TAIL = "_长尾词.txt"
+
+# File suffix: 长尾词_合并.txt (merged long-tail keyword list)
+FILE_SUFFIX_LONG_TAIL_MERGE = "_长尾词_合并.txt"
+
 
 def extract_word_from_5118(file_path: str):
@@ -40,13 +46,41 @@ def extract_word_from_5118(file_path: str):
             word_container.add(split[0])
 
     output_file_name = real_name[0:real_name.index("--")]
-    output_file_path = os.path.join(file_path, output_file_name + "_长尾词.txt")
+    output_file_path = os.path.join(file_path, output_file_name + FILE_SUFFIX_LONG_TAIL)
     with open(output_file_path, "w", encoding="utf-8") as f:
         for item in word_container:
             f.write(item)
             f.write("\n")
 
 
+def merge_word(file_path: str):
+    """
+    Merge the long-tail keyword files (with deduplication)
+    :param file_path: path of the folder to process
+    :return: None
+    """
+    # Collect the long-tail keyword files
+    file_list = []
+    for file in os.listdir(file_path):
+        if file.endswith(FILE_SUFFIX_LONG_TAIL):
+            file_list.append(os.path.join(file_path, file))
+
+    # Set container for the deduplicated keywords
+    word_set = set()
+
+    # Read the data and deduplicate
+    for file in file_list:
+        with open(file, "r", encoding="utf-8") as f:
+            for word in f:
+                word_set.add(word.replace("\n", ""))
+
+    # Save the merged result
+    with open(os.path.join(file_path, str(len(file_list)) + FILE_SUFFIX_LONG_TAIL_MERGE), "w", encoding="utf-8") as f:
+        for item in word_set:
+            f.write(item)
+            f.write("\n")
+
+
 def word_split_statistics(file_path: str):
     """
     Word segmentation statistics
@@ -61,7 +95,7 @@ def word_split_statistics(file_path: str):
     stop_word_dict = utils.load_stop_word()
 
     for i, file in enumerate(file_list):
-        if not file.endswith("_长尾词.txt"):
+        if not file.endswith(FILE_SUFFIX_LONG_TAIL):
             continue
 
         # Container for the segmentation results
@@ -85,14 +119,17 @@ def word_split_statistics(file_path: str):
         # Sort in descending order of word frequency
         sorted_key_list = sorted(key_dict.items(), key=lambda x: x[1], reverse=True)
 
-        output_file_name = file[file.rindex("\\")+1:file.index("_长尾词.txt")]
+        output_file_name = file[file.rindex("\\") + 1:file.index(FILE_SUFFIX_LONG_TAIL)]
         output_file_path = os.path.join(file_path, output_file_name + "_长尾词_分词统计.csv")
         with open(output_file_path, "w", encoding="UTF-8") as f:
             for key, count in sorted_key_list:
                 f.write("%s,%d\n" % (key, count))
 
+
 if __name__ == "__main__":
-    file_path = "E:\Download\原始词库"
-    # file_path = "E:\Download\测试"
-    extract_word_from_5118(file_path)
+    file_path = "../data"
+    # file_path = "E:\Download\测试"
+    # extract_word_from_5118(file_path)
+    merge_word(file_path)
     # word_split_statistics(file_path)
+
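
For context, a minimal driver sketch showing how the new merge step composes with the existing extraction step. The module name word_extract is hypothetical (the patch does not show the file's name), and the folder is assumed to hold 5118 keyword exports:

# Hypothetical driver for the patched module; "word_extract" is an assumed
# module name and "../data" an assumed folder of 5118 exports.
import word_extract

folder = "../data"

# Step 1 (pre-existing): extract keywords from the 5118 exports
# into per-source "*_长尾词.txt" lists.
word_extract.extract_word_from_5118(folder)

# Step 2 (added by this patch): merge every "*_长尾词.txt" file into a single
# deduplicated "<N>_长尾词_合并.txt", where N is the number of merged files.
word_extract.merge_word(folder)

# Step 3 (pre-existing, optional): per-file jieba segmentation statistics.
# word_extract.word_split_statistics(folder)

Note that merge_word writes out a Python set, so the line order of the merged file is arbitrary between runs; iterating over sorted(word_set) instead would make the output deterministic.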