Last active
June 10, 2019 04:42
-
-
Save nsh1l/c941ebe63827b8c52d176970d0756628 to your computer and use it in GitHub Desktop.
youmaniwa-blog_061019
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# 記事の連続取得" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"前回のデータが残っていたら削除" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"import os\n", | |
"\n", | |
"if os.path.exists('raw_entry-contents.txt'):\n", | |
" print(\"以前実行したファイルが残っています\")\n", | |
" q_delete = input('削除しますか?(y/n) >> ')\n", | |
" if q_delete == 'y':\n", | |
" os.remove('raw_entry-contents.txt')\n", | |
" os.remove('prcsd_entry-contents.txt')\n", | |
"else:\n", | |
" pass" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"HTTPリクエストを送りHTMLを抽出" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"metadata": { | |
"scrolled": true | |
}, | |
"outputs": [], | |
"source": [ | |
"import requests\n", | |
"from bs4 import BeautifulSoup\n", | |
"import re\n", | |
"\n", | |
"raw_urls = []\n", | |
"urls = []\n", | |
"with open('archive_url.txt','r') as f_in:\n", | |
" for url in f_in.readlines():\n", | |
" raw_urls.append(url)\n", | |
"\n", | |
"for i in raw_urls:\n", | |
" urls.append(i.replace('\\n',''))\n", | |
"\n", | |
"for url in urls:\n", | |
" r = requests.get(url)\n", | |
" r_text = r.content\n", | |
" \n", | |
" if r.status_code == 200:\n", | |
" soup = BeautifulSoup(r_text, 'lxml')\n", | |
" soup_entry = soup.find_all(class_='entry-content')\n", | |
" \n", | |
" with open('raw_entry-contents.txt','a', encoding = 'utf-8') as f_out:\n", | |
" for t in soup_entry:\n", | |
" f_out.write('{}\\n\\n---\\n\\n'.format(str(t.get_text)))" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"HTMLタグの除去" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 3, | |
"metadata": { | |
"scrolled": true | |
}, | |
"outputs": [], | |
"source": [ | |
"import re\n", | |
"\n", | |
"with open('raw_entry-contents.txt','r', encoding = 'utf-8') as cs_in:\n", | |
" raw_content = cs_in.read()\n", | |
"\n", | |
"with open('prcsd_entry-contents.txt','a', encoding = 'utf-8') as cs_out:\n", | |
" cs_out.write((re.sub('<.+?>',' ',raw_content)))" | |
] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.6.8" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 2 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment