{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import os\n", "\n", "import pandas as pd\n", "\n", "# Input table: one tokenised keyword and its frequency per row.\n", "ORIG_FILE = \"./data/范用性关键词-分词结果.csv\"\n", "# Output: the input table with stop words removed.\n", "DEST_FILE = \"./data/范用性关键词-分词结果-过滤停用词.csv\"\n", "# Output: the stop-word-free keywords that also pass the frequency cut.\n", "DEST_FILE_FILTER = \"./data/范用性关键词-分词结果-过滤停用词-词频大于300.csv\"\n", "# Directory of stop-word lists; each line of every file is read as one word.\n", "STOP_WORD_DIR = \"./data/stopwords\"" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "# Column names are supplied here; with `names` given, read_csv treats the\n", "# first line of the file as data rather than as a header.\n", "# dtype=str plus keep_default_na=False keeps keywords such as \"NA\", \"null\"\n", "# or \"None\" as text; by default read_csv would turn them into NaN.\n", "df = pd.read_csv(\n", "    ORIG_FILE,\n", "    names=['key', 'count'],\n", "    dtype={'key': str},\n", "    keep_default_na=False,\n", ")" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
keycount
0怎么1051516
1123009
2怎么办93937
3怎么样91070
463034
.........
116625做文1
116626提微商1
116627仰卧1
116628起坐1
116629仰卧起坐1
\n", "

116630 rows × 2 columns

\n", "
" ], "text/plain": [ " key count\n", "0 怎么 1051516\n", "1 的 123009\n", "2 怎么办 93937\n", "3 怎么样 91070\n", "4 做 63034\n", "... ... ...\n", "116625 做文 1\n", "116626 提微商 1\n", "116627 仰卧 1\n", "116628 起坐 1\n", "116629 仰卧起坐 1\n", "\n", "[116630 rows x 2 columns]" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "去重前,停用词数量: 5778\n", "去重后,停用词数量: 2462\n" ] } ], "source": [ "# Load the stop words: each line of every file in STOP_WORD_DIR is one word.\n", "stop_word = []\n", "\n", "# Sorted so the files are read in the same order on every run.\n", "stop_word_files = sorted(os.listdir(STOP_WORD_DIR))\n", "for file in stop_word_files:\n", "    stop_word_file = os.path.join(STOP_WORD_DIR, file)\n", "    # Sub-directories cannot be opened as word lists; skip them.\n", "    if not os.path.isfile(stop_word_file):\n", "        continue\n", "    # Decode explicitly instead of relying on the platform default encoding.\n", "    # \"utf-8-sig\" reads plain UTF-8 and also drops a leading BOM, which would\n", "    # otherwise stay glued to the first word of the file.\n", "    # NOTE(review): assumes the stop-word files are UTF-8 - confirm.\n", "    with open(stop_word_file, encoding=\"utf-8-sig\") as f:\n", "        for item in f:\n", "            # Strip only the line terminator; other whitespace is kept.\n", "            stop_word.append(item.rstrip(\"\\n\"))\n", "print(\"去重前,停用词数量:\", len(stop_word))\n", "# Deduplicate: the same word can appear in several lists.\n", "stop_word = list(set(stop_word))\n", "print(\"去重后,停用词数量:\", len(stop_word))" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "# Remove stop words. Series.isin hashes the word list once, so every keyword\n", "# costs one lookup instead of a scan of the list from a row-wise apply.\n", "df = df[~df['key'].isin(stop_word)]" ] }, { "cell_type": "code", "execution_count": 107, "metadata": {}, "outputs": [], "source": [ "# Export the filtered table without a header row or the row index.\n", "df.to_csv(DEST_FILE, header=False, index=False)" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
count
count115534.000000
mean27.613802
std311.900416
min1.000000
25%1.000000
50%2.000000
75%6.000000
max63034.000000
\n", "
" ], "text/plain": [ " count\n", "count 115534.000000\n", "mean 27.613802\n", "std 311.900416\n", "min 1.000000\n", "25% 1.000000\n", "50% 2.000000\n", "75% 6.000000\n", "max 63034.000000" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.describe()" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "大于1000的数量: 418\n", "大于500的数量: 1035\n", "大于400的数量: 1340\n", "大于300的数量: 1883\n", "大于250的数量: 2282\n", "大于100的数量: 5104\n" ] } ], "source": [ "# Count the keywords above each candidate frequency cut-off.\n", "for threshold in (1000, 500, 400, 300, 250, 100):\n", "    print(f\"大于{threshold}的数量:\", df.loc[df['count'] > threshold, 'key'].count())" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [], "source": [ "# Keep only the keywords that occur at least MIN_COUNT times.\n", "# NOTE(review): DEST_FILE_FILTER is named \"词频大于300\" (greater than 300),\n", "# while >= also keeps a count of exactly 300 - confirm which is intended.\n", "MIN_COUNT = 300\n", "df = df[df['count'] >= MIN_COUNT]" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [], "source": [ "# Export only the keyword column, without a header row or the row index.\n", "# Both the column selection and index=False are needed: to_csv writes every\n", "# column and the row index unless told otherwise.\n", "df[['key']].to_csv(DEST_FILE_FILTER, header=False, index=False)" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [], "source": [ "# Ad-hoc sample: the rows at positions 5 and 6 of the filtered frame.\n", "# index is left at its default here, so the row index is written as well.\n", "df.iloc[5:7].to_csv(\"./data/多线城-3.csv\", header=False)" ] } ], "metadata": { "interpreter": { "hash": "679ecc657d123b537eb7946f00483c298ba68f4074c79757b9e8823d90af42fb" }, "kernelspec": { "display_name": "Python 3.9.0 ('jieba')", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.0" }, "orig_nbformat": 4 }, "nbformat": 4, "nbformat_minor": 2 }