Skip to content

Instantly share code, notes, and snippets.

@incidunt
Last active March 31, 2019 10:55
Show Gist options
  • Save incidunt/8a2cf96c20b06186defa7225219b21b3 to your computer and use it in GitHub Desktop.
Save incidunt/8a2cf96c20b06186defa7225219b21b3 to your computer and use it in GitHub Desktop.
用pandas过滤出WordPress精品插件.ipynb
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"# Import every library this notebook uses up front\n",
"\n",
"# Library for reading .jsonl files\n",
"import jsonlines\n",
"\n",
"# Data-analysis libraries\n",
"import pandas as pd\n",
"import numpy as np\n",
"\n",
"\n",
"import maya\n",
"import json\n",
"\n",
"import requests\n",
"\n",
"import functools\n",
"\n",
"import html\n"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"plugins=[]\n",
"\n",
"keepkeys=[\n",
" \"slug\",\n",
" \"name\",\n",
" \"author\",\n",
" \"downloaded\",\n",
" \"rating\",\n",
" \"num_ratings\",\n",
" \"added\",\n",
" \"last_updated\",\n",
" \"tested\",\n",
" \"support_threads\",\n",
" \"support_threads_resolved\"]\n",
"\n",
"# output.jsonl is produced by a crawler; for details see:\n",
"# 用Python爬取WordPress官网所有插件 (Scraping all plugins from WordPress.org with Python)\n",
"# https://bestscreenshot.com/scrap-all-plugins-from-wordpress-org/\n",
"\n",
"with jsonlines.open('../output.jsonl') as reader:\n",
"    for obj in reader: \n",
"        for k in list(obj.keys()):\n",
"            if k not in keepkeys:\n",
"                del obj[k]\n",
"        plugins.append(obj)\n",
" \n",
"df = pd.DataFrame(plugins)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"# Unescape the HTML entities in the name column\n",
"df['name']=df['name'].apply(lambda x: html.unescape(x))"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"# Number of days elapsed between now and each plugin's last update\n",
"df['last_updated_days']=df['last_updated'].apply(lambda x: (maya.now() - maya.when(x) ).days ) "
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"# Combine all given conditions with an element-wise logical AND and return the combined condition\n",
"def conjunction(*conditions):\n",
" return functools.reduce(np.logical_and, conditions)\n"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"# All of the filter conditions\n",
"last_updated_days_bool = df['last_updated_days'] <= 365\n",
"\n",
"downloaded_bool = df['downloaded'] > 1000\n",
"\n",
"rating_bool = df['rating'] >= 90 \n",
"\n",
"num_ratings_bool = df['num_ratings'] >= 100 \n",
"\n",
"support_threads_bool = df['support_threads'] > 0 \n",
"\n",
"support_threads_resolved_bool = df[\"support_threads_resolved\"] > 0 \n"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"data_filtered = df[conjunction(\n",
" last_updated_days_bool,\n",
" downloaded_bool,\n",
" rating_bool,\n",
" num_ratings_bool,\n",
" support_threads_bool,\n",
" support_threads_resolved_bool,\n",
" )]"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"added 312\n",
"author 312\n",
"downloaded 312\n",
"last_updated 312\n",
"name 312\n",
"num_ratings 312\n",
"rating 312\n",
"slug 312\n",
"support_threads 312\n",
"support_threads_resolved 312\n",
"tested 312\n",
"last_updated_days 312\n",
"dtype: int64"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data_filtered.count()"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"# Convert the filtered rows into a dict keyed by slug\n",
"data_filtered_dict=data_filtered.set_index('slug').T.to_dict()"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"# Flatten that dict into a list of records, putting the slug back into each record\n",
"data_filtered_list=[]\n",
"for k,v in data_filtered_dict.items():\n",
" x={'slug':k}\n",
" y=v\n",
" z = {**x, **y}\n",
" data_filtered_list.append(z)"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'slug': 'accelerated-mobile-pages',\n",
" 'added': '2016-02-07',\n",
" 'author': '<a href=\"https://ampforwp.com/\">Ahmed Kaludi, Mohammed Kaludi</a>',\n",
" 'downloaded': 4137343,\n",
" 'last_updated': '2019-02-28 12:52pm GMT',\n",
" 'name': 'AMP for WP – Accelerated Mobile Pages',\n",
" 'num_ratings': 780,\n",
" 'rating': 90,\n",
" 'support_threads': 286,\n",
" 'support_threads_resolved': 60,\n",
" 'tested': '5.1',\n",
" 'last_updated_days': 30}"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data_filtered_list[1]"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
"# Write the result list to a JSON file\n",
"json.dump(data_filtered_list, open(\"data_filtered_result.json\",\"w\"))\n"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.2"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment