Skip to content

Instantly share code, notes, and snippets.

@tony
Created January 23, 2014 23:30
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save tony/8588998 to your computer and use it in GitHub Desktop.
Save tony/8588998 to your computer and use it in GitHub Desktop.
Scrape Chinese Radical information from HTML tables on the internet using requests and pandas.
#!/usr/bin/env python
# -*- coding: utf8 -*-
"""Pull Chinese Radical information from HTML tables on the internet.
:license: MIT License
:author: Tony Narlock
Requirements:
- lxml, html5lib, beautifulsoup4
- pandas
- requests
"""
from __future__ import absolute_import, division, print_function, \
with_statement, unicode_literals
import os
import pandas
import requests
from bs4 import BeautifulSoup
def radicals_archchinese():
    """Return pandas DataFrame for archchinese.com radicals.

    Downloads the radicals page, isolates the ``<table>`` element that
    carries the ``radicaltable`` CSS class and hands its markup to
    ``pandas.read_html``. The first two rows of the table are skipped and
    the first column becomes the index.

    :returns: the first DataFrame parsed from the table markup.
    :raises requests.exceptions.RequestException: if the request times out,
        the connection fails, or the server answers with an error status.
    :raises ValueError: if the page contains no ``radicaltable`` table.
    """
    # Imported locally so the block does not depend on a file-level import.
    from io import StringIO

    url = 'http://www.archchinese.com/arch_chinese_radicals.html'
    # A timeout keeps the script from hanging forever on a stalled server.
    response = requests.get(url, timeout=30)
    # Fail loudly on 4xx/5xx instead of trying to parse an error page.
    response.raise_for_status()
    # Force UTF-8 decoding of the body rather than trusting the guessed
    # encoding.
    response.encoding = 'utf-8'

    # Name the parser explicitly: without it bs4 picks whichever parser is
    # installed and warns. lxml is listed in the module requirements.
    soup = BeautifulSoup(response.text, 'lxml')
    table = soup.find("table", class_="radicaltable")
    if table is None:
        raise ValueError(
            'No <table class="radicaltable"> found at {0}'.format(url)
        )

    # read_html expects a file-like object; passing literal HTML strings is
    # deprecated in recent pandas releases.
    frames = pandas.read_html(
        io=StringIO(table.prettify()),
        skiprows=2,
        index_col=0
    )
    return frames[0]
def radicals_yellowbridge():
    """Return pandas DataFrame from yellowbridge.com radicals.

    Downloads the radicals page, isolates the ``<table>`` element that
    carries the ``sortable`` CSS class and hands its markup to
    ``pandas.read_html``. The first table row is used as the header and
    columns 0 through 5 together form the index.

    :returns: the first DataFrame parsed from the table markup.
    :raises requests.exceptions.RequestException: if the request times out,
        the connection fails, or the server answers with an error status.
    :raises ValueError: if the page contains no ``sortable`` table.
    """
    # Imported locally so the block does not depend on a file-level import.
    from io import StringIO

    url = 'http://www.yellowbridge.com/chinese/radicals.php'
    # A timeout keeps the script from hanging forever on a stalled server.
    response = requests.get(url, timeout=30)
    # Fail loudly on 4xx/5xx instead of trying to parse an error page.
    response.raise_for_status()
    # Force UTF-8 decoding of the body rather than trusting the guessed
    # encoding.
    response.encoding = 'utf-8'

    # Name the parser explicitly: without it bs4 picks whichever parser is
    # installed and warns. lxml is listed in the module requirements.
    soup = BeautifulSoup(response.text, 'lxml')
    table = soup.find("table", class_="sortable")
    if table is None:
        raise ValueError(
            'No <table class="sortable"> found at {0}'.format(url)
        )

    # read_html expects a file-like object; passing literal HTML strings is
    # deprecated in recent pandas releases.
    frames = pandas.read_html(
        io=StringIO(table.prettify()),
        index_col=[0, 1, 2, 3, 4, 5],
        header=0
    )
    return frames[0]
if __name__ == "__main__":
    # Both scrapers perform live HTTP requests, so run them only when the
    # file is executed as a script, not when it is imported.
    archchinese_df = radicals_archchinese()
    yellowbridge_df = radicals_yellowbridge()
    # Keep the two results under separate names so that both tables are
    # printed instead of the second assignment hiding the first.
    print(archchinese_df)
    print(yellowbridge_df)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment