Created
January 23, 2014 23:30
-
-
Save tony/8588998 to your computer and use it in GitHub Desktop.
Scrape Chinese Radical information from HTML tables on the internet using requests and pandas.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
# -*- coding: utf8 -*- | |
"""Pull Chinese Radical information from HTML tables on the internet. | |
:license: MIT License | |
:author: Tony Narlock | |
Requirements: | |
- lxml, html5lib, beautifulsoup4 | |
- pandas | |
- requests | |
""" | |
from __future__ import absolute_import, division, print_function, \ | |
with_statement, unicode_literals | |
import os | |
import pandas | |
import requests | |
from bs4 import BeautifulSoup | |
def radicals_archchinese():
    """Return pandas DataFrame for archchinese.com radicals.

    Downloads the radicals page, isolates the ``<table>`` element whose CSS
    class is ``radicaltable`` and hands only that markup to pandas. The first
    two rows of the table are skipped and column 0 becomes the index.

    :returns: the first table pandas parses out of the isolated markup.
    :rtype: :class:`pandas.DataFrame`
    :raises requests.exceptions.RequestException: on connection failure,
        timeout, or a 4xx/5xx response.
    :raises ValueError: if the page contains no ``radicaltable`` table.
    """
    # Local import keeps this function self-contained; io.StringIO exists on
    # both Python 2.6+ and Python 3.
    from io import StringIO

    url = 'http://www.archchinese.com/arch_chinese_radicals.html'

    # Without a timeout a stalled server would block the script indefinitely.
    r = requests.get(url, timeout=30)
    # Fail here on an HTTP error instead of trying to parse an error page.
    r.raise_for_status()
    # Override whatever encoding requests guessed so r.text decodes as UTF-8.
    r.encoding = 'utf-8'

    # Name the parser explicitly: leaving it out makes BeautifulSoup pick
    # whichever parser is installed, so results could differ per machine.
    soup = BeautifulSoup(r.text, 'lxml')
    table = soup.find("table", class_="radicaltable")
    if table is None:
        raise ValueError(
            'No <table class="radicaltable"> found at {0}'.format(url)
        )

    # read_html wants a file-like object; passing the markup as a bare
    # string is deprecated in recent pandas releases.
    return pandas.read_html(
        io=StringIO(table.prettify()),
        skiprows=2,
        index_col=0,
    )[0]
def radicals_yellowbridge():
    """Return pandas DataFrame from yellowbridge.com radicals.

    Downloads the radicals page, isolates the ``<table>`` element whose CSS
    class is ``sortable`` and hands only that markup to pandas. Row 0 is used
    as the header and columns 0 through 5 together form the index.

    :returns: the first table pandas parses out of the isolated markup.
    :rtype: :class:`pandas.DataFrame`
    :raises requests.exceptions.RequestException: on connection failure,
        timeout, or a 4xx/5xx response.
    :raises ValueError: if the page contains no ``sortable`` table.
    """
    # Local import keeps this function self-contained; io.StringIO exists on
    # both Python 2.6+ and Python 3.
    from io import StringIO

    url = 'http://www.yellowbridge.com/chinese/radicals.php'

    # Without a timeout a stalled server would block the script indefinitely.
    r = requests.get(url, timeout=30)
    # Fail here on an HTTP error instead of trying to parse an error page.
    r.raise_for_status()
    # Override whatever encoding requests guessed so r.text decodes as UTF-8.
    r.encoding = 'utf-8'

    # Name the parser explicitly: leaving it out makes BeautifulSoup pick
    # whichever parser is installed, so results could differ per machine.
    soup = BeautifulSoup(r.text, 'lxml')
    table = soup.find("table", class_="sortable")
    if table is None:
        raise ValueError(
            'No <table class="sortable"> found at {0}'.format(url)
        )

    # read_html wants a file-like object; passing the markup as a bare
    # string is deprecated in recent pandas releases.
    return pandas.read_html(
        io=StringIO(table.prettify()),
        index_col=[0, 1, 2, 3, 4, 5],
        header=0,
    )[0]
# Run both scrapers in order. Only the yellowbridge table is kept and printed;
# the archchinese call is still made, but its result is not used afterwards.
radicals_archchinese()
df = radicals_yellowbridge()
print(df)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment