|
|
@@ -0,0 +1,363 @@
|
|
|
+{
|
|
|
+ "cells": [
|
|
|
+ {
|
|
|
+ "cell_type": "code",
|
|
|
+ "execution_count": 1,
|
|
|
+ "metadata": {},
|
|
|
+ "outputs": [],
|
|
|
+ "source": [
|
|
|
+ "import pandas as pd\n",
|
|
|
+ "import os\n",
|
|
|
+ "\n",
|
|
|
+ "# Input CSV that this notebook loads with pd.read_csv.\n",
|
|
|
+ "ORIG_FILE = \"./data/范用性关键词-分词结果.csv\"\n",
|
|
|
+ "# Output CSV written after the stop-word filter has been applied.\n",
|
|
|
+ "DEST_FILE = \"./data/范用性关键词-分词结果-过滤停用词.csv\"\n",
|
|
|
+ "# Output CSV written after the additional frequency filter has been applied.\n",
|
|
|
+ "DEST_FILE_FILTER = \"./data/范用性关键词-分词结果-过滤停用词-词频大于300.csv\"\n",
|
|
|
+ "# Directory whose files are read line by line as stop-word lists.\n",
|
|
|
+ "STOP_WORD_DIR = \"./data/stopwords\""
|
|
|
+ ]
|
|
|
+ },
|
|
|
+ {
|
|
|
+ "cell_type": "code",
|
|
|
+ "execution_count": 2,
|
|
|
+ "metadata": {},
|
|
|
+ "outputs": [],
|
|
|
+ "source": [
|
|
|
+ "df = pd.read_csv(ORIG_FILE, names=['key','count'])"
|
|
|
+ ]
|
|
|
+ },
|
|
|
+ {
|
|
|
+ "cell_type": "code",
|
|
|
+ "execution_count": 3,
|
|
|
+ "metadata": {},
|
|
|
+ "outputs": [
|
|
|
+ {
|
|
|
+ "data": {
|
|
|
+ "text/html": [
|
|
|
+ "<div>\n",
|
|
|
+ "<style scoped>\n",
|
|
|
+ " .dataframe tbody tr th:only-of-type {\n",
|
|
|
+ " vertical-align: middle;\n",
|
|
|
+ " }\n",
|
|
|
+ "\n",
|
|
|
+ " .dataframe tbody tr th {\n",
|
|
|
+ " vertical-align: top;\n",
|
|
|
+ " }\n",
|
|
|
+ "\n",
|
|
|
+ " .dataframe thead th {\n",
|
|
|
+ " text-align: right;\n",
|
|
|
+ " }\n",
|
|
|
+ "</style>\n",
|
|
|
+ "<table border=\"1\" class=\"dataframe\">\n",
|
|
|
+ " <thead>\n",
|
|
|
+ " <tr style=\"text-align: right;\">\n",
|
|
|
+ " <th></th>\n",
|
|
|
+ " <th>key</th>\n",
|
|
|
+ " <th>count</th>\n",
|
|
|
+ " </tr>\n",
|
|
|
+ " </thead>\n",
|
|
|
+ " <tbody>\n",
|
|
|
+ " <tr>\n",
|
|
|
+ " <th>0</th>\n",
|
|
|
+ " <td>怎么</td>\n",
|
|
|
+ " <td>1051516</td>\n",
|
|
|
+ " </tr>\n",
|
|
|
+ " <tr>\n",
|
|
|
+ " <th>1</th>\n",
|
|
|
+ " <td>的</td>\n",
|
|
|
+ " <td>123009</td>\n",
|
|
|
+ " </tr>\n",
|
|
|
+ " <tr>\n",
|
|
|
+ " <th>2</th>\n",
|
|
|
+ " <td>怎么办</td>\n",
|
|
|
+ " <td>93937</td>\n",
|
|
|
+ " </tr>\n",
|
|
|
+ " <tr>\n",
|
|
|
+ " <th>3</th>\n",
|
|
|
+ " <td>怎么样</td>\n",
|
|
|
+ " <td>91070</td>\n",
|
|
|
+ " </tr>\n",
|
|
|
+ " <tr>\n",
|
|
|
+ " <th>4</th>\n",
|
|
|
+ " <td>做</td>\n",
|
|
|
+ " <td>63034</td>\n",
|
|
|
+ " </tr>\n",
|
|
|
+ " <tr>\n",
|
|
|
+ " <th>...</th>\n",
|
|
|
+ " <td>...</td>\n",
|
|
|
+ " <td>...</td>\n",
|
|
|
+ " </tr>\n",
|
|
|
+ " <tr>\n",
|
|
|
+ " <th>116625</th>\n",
|
|
|
+ " <td>做文</td>\n",
|
|
|
+ " <td>1</td>\n",
|
|
|
+ " </tr>\n",
|
|
|
+ " <tr>\n",
|
|
|
+ " <th>116626</th>\n",
|
|
|
+ " <td>提微商</td>\n",
|
|
|
+ " <td>1</td>\n",
|
|
|
+ " </tr>\n",
|
|
|
+ " <tr>\n",
|
|
|
+ " <th>116627</th>\n",
|
|
|
+ " <td>仰卧</td>\n",
|
|
|
+ " <td>1</td>\n",
|
|
|
+ " </tr>\n",
|
|
|
+ " <tr>\n",
|
|
|
+ " <th>116628</th>\n",
|
|
|
+ " <td>起坐</td>\n",
|
|
|
+ " <td>1</td>\n",
|
|
|
+ " </tr>\n",
|
|
|
+ " <tr>\n",
|
|
|
+ " <th>116629</th>\n",
|
|
|
+ " <td>仰卧起坐</td>\n",
|
|
|
+ " <td>1</td>\n",
|
|
|
+ " </tr>\n",
|
|
|
+ " </tbody>\n",
|
|
|
+ "</table>\n",
|
|
|
+ "<p>116630 rows × 2 columns</p>\n",
|
|
|
+ "</div>"
|
|
|
+ ],
|
|
|
+ "text/plain": [
|
|
|
+ " key count\n",
|
|
|
+ "0 怎么 1051516\n",
|
|
|
+ "1 的 123009\n",
|
|
|
+ "2 怎么办 93937\n",
|
|
|
+ "3 怎么样 91070\n",
|
|
|
+ "4 做 63034\n",
|
|
|
+ "... ... ...\n",
|
|
|
+ "116625 做文 1\n",
|
|
|
+ "116626 提微商 1\n",
|
|
|
+ "116627 仰卧 1\n",
|
|
|
+ "116628 起坐 1\n",
|
|
|
+ "116629 仰卧起坐 1\n",
|
|
|
+ "\n",
|
|
|
+ "[116630 rows x 2 columns]"
|
|
|
+ ]
|
|
|
+ },
|
|
|
+ "execution_count": 3,
|
|
|
+ "metadata": {},
|
|
|
+ "output_type": "execute_result"
|
|
|
+ }
|
|
|
+ ],
|
|
|
+ "source": [
|
|
|
+ "df"
|
|
|
+ ]
|
|
|
+ },
|
|
|
+ {
|
|
|
+ "cell_type": "code",
|
|
|
+ "execution_count": 4,
|
|
|
+ "metadata": {},
|
|
|
+ "outputs": [
|
|
|
+ {
|
|
|
+ "name": "stdout",
|
|
|
+ "output_type": "stream",
|
|
|
+ "text": [
|
|
|
+ "去重前,停用词数量: 5778\n",
|
|
|
+ "去重后,停用词数量: 2462\n"
|
|
|
+ ]
|
|
|
+ }
|
|
|
+ ],
|
|
|
+ "source": [
|
|
|
+ "# 导入停用词\n",
|
|
|
+ "\n",
|
|
|
+ "stop_word = [];\n",
|
|
|
+ "\n",
|
|
|
+ "stop_word_files = os.listdir(STOP_WORD_DIR)\n",
|
|
|
+ "for file in stop_word_files:\n",
|
|
|
+ " stop_word_file = os.path.join(STOP_WORD_DIR, file)\n",
|
|
|
+ " with open(stop_word_file) as f:\n",
|
|
|
+ " for item in f:\n",
|
|
|
+ " stop_word.append(item.replace(\"\\n\",\"\"))\n",
|
|
|
+ "print(\"去重前,停用词数量:\", len(stop_word))\n",
|
|
|
+ "stop_word = list(set(stop_word))\n",
|
|
|
+ "print(\"去重后,停用词数量:\", len(stop_word))"
|
|
|
+ ]
|
|
|
+ },
|
|
|
+ {
|
|
|
+ "cell_type": "code",
|
|
|
+ "execution_count": 5,
|
|
|
+ "metadata": {},
|
|
|
+ "outputs": [],
|
|
|
+ "source": [
|
|
|
+ "# 过滤停用词\n",
|
|
|
+ "df = df[df.apply(lambda row : row['key'] not in stop_word, axis=1)]"
|
|
|
+ ]
|
|
|
+ },
|
|
|
+ {
|
|
|
+ "cell_type": "code",
|
|
|
+ "execution_count": 107,
|
|
|
+ "metadata": {},
|
|
|
+ "outputs": [],
|
|
|
+ "source": [
|
|
|
+ "# 导出过滤后的数据,不要表头和行号\n",
|
|
|
+ "df.to_csv(DEST_FILE, header=False, index=False)"
|
|
|
+ ]
|
|
|
+ },
|
|
|
+ {
|
|
|
+ "cell_type": "code",
|
|
|
+ "execution_count": 6,
|
|
|
+ "metadata": {},
|
|
|
+ "outputs": [
|
|
|
+ {
|
|
|
+ "data": {
|
|
|
+ "text/html": [
|
|
|
+ "<div>\n",
|
|
|
+ "<style scoped>\n",
|
|
|
+ " .dataframe tbody tr th:only-of-type {\n",
|
|
|
+ " vertical-align: middle;\n",
|
|
|
+ " }\n",
|
|
|
+ "\n",
|
|
|
+ " .dataframe tbody tr th {\n",
|
|
|
+ " vertical-align: top;\n",
|
|
|
+ " }\n",
|
|
|
+ "\n",
|
|
|
+ " .dataframe thead th {\n",
|
|
|
+ " text-align: right;\n",
|
|
|
+ " }\n",
|
|
|
+ "</style>\n",
|
|
|
+ "<table border=\"1\" class=\"dataframe\">\n",
|
|
|
+ " <thead>\n",
|
|
|
+ " <tr style=\"text-align: right;\">\n",
|
|
|
+ " <th></th>\n",
|
|
|
+ " <th>count</th>\n",
|
|
|
+ " </tr>\n",
|
|
|
+ " </thead>\n",
|
|
|
+ " <tbody>\n",
|
|
|
+ " <tr>\n",
|
|
|
+ " <th>count</th>\n",
|
|
|
+ " <td>115534.000000</td>\n",
|
|
|
+ " </tr>\n",
|
|
|
+ " <tr>\n",
|
|
|
+ " <th>mean</th>\n",
|
|
|
+ " <td>27.613802</td>\n",
|
|
|
+ " </tr>\n",
|
|
|
+ " <tr>\n",
|
|
|
+ " <th>std</th>\n",
|
|
|
+ " <td>311.900416</td>\n",
|
|
|
+ " </tr>\n",
|
|
|
+ " <tr>\n",
|
|
|
+ " <th>min</th>\n",
|
|
|
+ " <td>1.000000</td>\n",
|
|
|
+ " </tr>\n",
|
|
|
+ " <tr>\n",
|
|
|
+ " <th>25%</th>\n",
|
|
|
+ " <td>1.000000</td>\n",
|
|
|
+ " </tr>\n",
|
|
|
+ " <tr>\n",
|
|
|
+ " <th>50%</th>\n",
|
|
|
+ " <td>2.000000</td>\n",
|
|
|
+ " </tr>\n",
|
|
|
+ " <tr>\n",
|
|
|
+ " <th>75%</th>\n",
|
|
|
+ " <td>6.000000</td>\n",
|
|
|
+ " </tr>\n",
|
|
|
+ " <tr>\n",
|
|
|
+ " <th>max</th>\n",
|
|
|
+ " <td>63034.000000</td>\n",
|
|
|
+ " </tr>\n",
|
|
|
+ " </tbody>\n",
|
|
|
+ "</table>\n",
|
|
|
+ "</div>"
|
|
|
+ ],
|
|
|
+ "text/plain": [
|
|
|
+ " count\n",
|
|
|
+ "count 115534.000000\n",
|
|
|
+ "mean 27.613802\n",
|
|
|
+ "std 311.900416\n",
|
|
|
+ "min 1.000000\n",
|
|
|
+ "25% 1.000000\n",
|
|
|
+ "50% 2.000000\n",
|
|
|
+ "75% 6.000000\n",
|
|
|
+ "max 63034.000000"
|
|
|
+ ]
|
|
|
+ },
|
|
|
+ "execution_count": 6,
|
|
|
+ "metadata": {},
|
|
|
+ "output_type": "execute_result"
|
|
|
+ }
|
|
|
+ ],
|
|
|
+ "source": [
|
|
|
+ "df.describe()"
|
|
|
+ ]
|
|
|
+ },
|
|
|
+ {
|
|
|
+ "cell_type": "code",
|
|
|
+ "execution_count": 7,
|
|
|
+ "metadata": {},
|
|
|
+ "outputs": [
|
|
|
+ {
|
|
|
+ "name": "stdout",
|
|
|
+ "output_type": "stream",
|
|
|
+ "text": [
|
|
|
+ "大于1000的数量: 418\n",
|
|
|
+ "大于500的数量: 1035\n",
|
|
|
+ "大于400的数量: 1340\n",
|
|
|
+ "大于300的数量: 1883\n",
|
|
|
+ "大于250的数量: 2282\n",
|
|
|
+ "大于100的数量: 5104\n"
|
|
|
+ ]
|
|
|
+ }
|
|
|
+ ],
|
|
|
+ "source": [
|
|
|
+ "print(\"大于1000的数量:\", df[df['count'] > 1000].count().key)\n",
|
|
|
+ "print(\"大于500的数量:\", df[df['count'] > 500].count().key)\n",
|
|
|
+ "print(\"大于400的数量:\", df[df['count'] > 400].count().key)\n",
|
|
|
+ "print(\"大于300的数量:\", df[df['count'] > 300].count().key)\n",
|
|
|
+ "print(\"大于250的数量:\", df[df['count'] > 250].count().key)\n",
|
|
|
+ "print(\"大于100的数量:\", df[df['count'] > 100].count().key)"
|
|
|
+ ]
|
|
|
+ },
|
|
|
+ {
|
|
|
+ "cell_type": "code",
|
|
|
+ "execution_count": 9,
|
|
|
+ "metadata": {},
|
|
|
+ "outputs": [],
|
|
|
+ "source": [
|
|
|
+ "# 过滤词频小于300的数据\n",
|
|
|
+ "df=df[df['count']>=300]"
|
|
|
+ ]
|
|
|
+ },
|
|
|
+ {
|
|
|
+ "cell_type": "code",
|
|
|
+ "execution_count": 10,
|
|
|
+ "metadata": {},
|
|
|
+ "outputs": [],
|
|
|
+ "source": [
|
|
|
+ "# 导出过滤后的数据,不要表头和行号,只要关键词列\n",
|
|
|
+ "df.to_csv(DEST_FILE_FILTER, header=False)"
|
|
|
+ ]
|
|
|
+ },
|
|
|
+ {
|
|
|
+ "cell_type": "code",
|
|
|
+ "execution_count": 16,
|
|
|
+ "metadata": {},
|
|
|
+ "outputs": [],
|
|
|
+ "source": [
|
|
|
+ "df[5:7].to_csv(\"./data/多线城-3.csv\", header=False)"
|
|
|
+ ]
|
|
|
+ }
|
|
|
+ ],
|
|
|
+ "metadata": {
|
|
|
+ "interpreter": {
|
|
|
+ "hash": "679ecc657d123b537eb7946f00483c298ba68f4074c79757b9e8823d90af42fb"
|
|
|
+ },
|
|
|
+ "kernelspec": {
|
|
|
+ "display_name": "Python 3.9.0 ('jieba')",
|
|
|
+ "language": "python",
|
|
|
+ "name": "python3"
|
|
|
+ },
|
|
|
+ "language_info": {
|
|
|
+ "codemirror_mode": {
|
|
|
+ "name": "ipython",
|
|
|
+ "version": 3
|
|
|
+ },
|
|
|
+ "file_extension": ".py",
|
|
|
+ "mimetype": "text/x-python",
|
|
|
+ "name": "python",
|
|
|
+ "nbconvert_exporter": "python",
|
|
|
+ "pygments_lexer": "ipython3",
|
|
|
+ "version": "3.9.0"
|
|
|
+ },
|
|
|
+ "orig_nbformat": 4
|
|
|
+ },
|
|
|
+ "nbformat": 4,
|
|
|
+ "nbformat_minor": 2
|
|
|
+}
|