Skip to content

Instantly share code, notes, and snippets.

@samm81
Last active September 15, 2017 23:46
Show Gist options
  • Save samm81/9909de5476461ae0855bde88b848d8b2 to your computer and use it in GitHub Desktop.
Save samm81/9909de5476461ae0855bde88b848d8b2 to your computer and use it in GitHub Desktop.
Simple script to create an Anki-importable CSV for new Chinese flash cards.
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from lxml import html
import requests
import sys
import readline
# curl 'http://www.yellowbridge.com/chinese/dictionary.php' -H 'Cookie: PHPSESSID=qijdue2gmrhs4pltdphjcqj3o2; _ga=GA1.2.112204757.1484656688; _gid=GA1.2.125872261.1495576784' -H 'Origin: http://www.yellowbridge.com' -H 'Accept-Encoding: gzip, deflate' -H 'Accept-Language: en-US,en;q=0.8' -H 'Upgrade-Insecure-Requests: 1' -H 'User-Agent: Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36' -H 'Content-Type: application/x-www-form-urlencoded' -H 'Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8' -H 'Cache-Control: max-age=0' -H 'Referer: http://www.yellowbridge.com/chinese/dictionary.php' -H 'Connection: keep-alive' -H 'DNT: 1' --data 'searchMode=P&word=shen2me' --compressed
# HTTP headers replayed from the captured browser request shown in the curl
# command above; they are sent with every yellowbridge request in this script.
headers = {
    # session and analytics cookies copied from the captured request
    'Cookie': ('PHPSESSID=qijdue2gmrhs4pltdphjcqj3o2; '
               '_ga=GA1.2.112204757.1484656688; '
               '_gid=GA1.2.125872261.1495576784'),
    'Origin': 'http://www.yellowbridge.com',
    # 'Accept-Encoding': 'gzip, deflate' is part of the captured request but
    # is deliberately left out of this dict.
    'Accept-Language': 'en-US,en;q=0.8',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': ('Mozilla/5.0 (X11; Linux x86_64) '
                   'AppleWebKit/537.36 (KHTML, like Gecko) '
                   'Chrome/57.0.2987.133 Safari/537.36'),
    'Content-Type': 'application/x-www-form-urlencoded',
    'Accept': ('text/html,application/xhtml+xml,application/xml;q=0.9,'
               'image/webp,*/*;q=0.8'),
    'Cache-Control': 'max-age=0',
    'Referer': 'http://www.yellowbridge.com/chinese/dictionary.php',
    'Connection': 'keep-alive',
    'DNT': '1',
}
def ask_ok():
    """Ask the user to confirm the note; exit with status 1 if they decline.

    An empty answer, 'y' or 'Y' confirms and the function simply returns.
    'n' or 'N' prints a notice and terminates the program. Any other answer
    re-prompts after an 'invalid input.' notice.
    """
    question = 'does this look ok ([y]/n) ? '
    answer = raw_input(question)
    # keep asking until the answer is one of the recognised choices
    while answer not in ('', 'y', 'Y', 'n', 'N'):
        answer = raw_input('invalid input.\n{}'.format(question))
    if answer in ('n', 'N'):
        print('ok. exiting')
        sys.exit(1)
def search_yellowbridge(pinging):
    """Get the results of a pinging search from yellowbridge.

    Args:
        pinging: the search text typed by the user; sent as the ``word``
            query parameter together with ``searchMode=P``.

    Returns:
        The first ``<div id="tabBody">`` element of the parsed response.

    Raises:
        IndexError: if the response contains no ``div`` with id ``tabBody``.
    """
    # Let requests build the query string so characters such as '&', '#',
    # '+' or spaces in the user's input are percent-encoded instead of
    # corrupting the URL. A list of pairs keeps the parameter order stable.
    page = requests.get(
        'https://www.yellowbridge.com/chinese/dictionary.php',
        params=[('searchMode', 'P'), ('word', pinging)],
        headers=headers,
    )
    tree = html.fromstring(page.content)
    matches = tree.xpath('//div[@id="tabBody"]')
    if not matches:
        # same exception type as the bare [0] lookup, but with an explanation
        raise IndexError(
            'yellowbridge response has no div#tabBody '
            '(HTTP status {})'.format(page.status_code))
    return matches[0]
def choose_search_result(main_div):
    """Convert the rows of the multi-result table into plain tuples.

    Every ``tr`` of the table with id ``multiRow`` yields one
    ``(char, pinging, definition, url)`` tuple: the link text and link target
    of the first cell (the target prefixed with the site root), and the full
    text content of the second and third cells.

    Returns:
        A list with one tuple per table row, in document order.
    """
    site_root = "https://www.yellowbridge.com"
    entries = []
    for row in main_div.xpath('//table[@id="multiRow"]/tr'):
        char = row.xpath('td[1]/a/text()')[0]
        href = row.xpath('td[1]/a/@href')[0]
        # text_content() is used for these cells so text nested in child
        # elements is included as well
        pinging = row.xpath('td[2]')[0].text_content()
        definition = row.xpath('td[3]')[0].text_content()
        entries.append((char, pinging, definition, site_root + href))
    return entries
def pick_from_results(results):
    """Let the user choose one of the search matches and return its URL.

    Each entry of *results* is listed with its index followed by its first
    three fields. The user is then asked for an index until the answer is a
    digit string within range; the fourth field (the URL) of the chosen entry
    is returned.
    """
    print('found the following matches:')
    for index, entry in enumerate(results):
        print(u'{}\t{}\t{}\t{}'.format(index, *entry[:3]))
    question = 'which word [0-{}] ? '.format(len(results) - 1)
    answer = raw_input(question)
    # isdigit() is checked first so int() is only applied to digit strings
    while not (answer.isdigit() and 0 <= int(answer) < len(results)):
        answer = raw_input('invalid input.\n{}'.format(question))
    return results[int(answer)][3]  # url of choice
def retrieve_page(url):
    """Download *url* using the module-level headers and return the parsed tree."""
    response = requests.get(url, headers=headers)
    return html.fromstring(response.content)
def get_word_data(tree):
    """Gets the data that will be put into the flashcard from the yellowbridge page.

    Walks the rows of the table with id ``mainData``. The first text node of
    a row's first cell is the field label; the text content of the second
    cell is the field value.

    Args:
        tree: a parsed element exposing ``xpath``; each row's second cell
            must expose ``text_content()``.

    Returns:
        A 7-tuple ``(english, extra_definition, pinging, ef_pinging, zhuyin,
        char, trad)``. Fields that are not found are ``''``;
        ``extra_definition`` is always ``''``.
    """
    english = pinging = ef_pinging = zhuyin = char = trad = ''
    for row in tree.xpath('//table[@id="mainData"]/tr'):
        labels = row.xpath('td[1]/text()')
        cells = row.xpath('td[2]')
        if not labels or not cells:
            # A row without label text or without a value cell carries no
            # field; skip it instead of failing with IndexError.
            continue
        # Compare stripped labels so a match does not depend on incidental
        # surrounding whitespace (e.g. 'Effective Pinyin ').
        title = labels[0].strip()
        data = cells[0].text_content()
        if title == 'English Definition':
            if u'粵' in data:
                # keep only the text before the '(粵)' marker
                data = data.split(u'(粵)')[0]
            english = data
        elif title == 'Simplified Script':
            char = data
        elif title == 'Traditional Script':
            # a value containing 'Same' means there is no separate form
            trad = '' if 'Same' in data else data
        elif title == 'Pinyin':
            if 'also' in data or 'pr.' in data:
                # keep only the part before the first ' ,' separator
                data = data.split(' ,')[0]
            pinging = data
        elif title == 'Effective Pinyin':
            ef_pinging = '' if 'Same' in data else data
        elif title == 'Zhuyin (Bopomofo)':
            zhuyin = data
    extra_definition = ''
    return (english, extra_definition, pinging, ef_pinging, zhuyin, char, trad)
def _prompt_for_tags():
    """Ask for the tags of one note and return them as a list (possibly empty).

    Commas are dropped, the same clean-up applied to command-line tags, and
    the answer is split on whitespace.
    """
    answer = raw_input('tags (space separated, blank for none): ')
    return answer.replace(',', '').split()


def add_word(cli_tags, ofile):
    """Interactively look up one word and append it to *ofile* as a note.

    Prompts for a pinging, searches yellowbridge, lets the user pick among
    several matches if necessary, shows a preview of the note, and after
    confirmation writes one tab-separated, UTF-8 encoded line.

    Args:
        cli_tags: list of tags given on the command line. When empty, the
            user is asked for tags for this note.
        ofile: writable file object receiving the encoded line.

    The program exits with status 1 when no match is found, and ``ask_ok``
    does the same when the user rejects the preview.
    """
    query = raw_input('pinging for word: ')
    print('searching for {}...'.format(query))
    main_div = search_yellowbridge(query)
    if main_div.xpath('//table[@id="multiRow"]/tr'):  # then we got results
        results = choose_search_result(main_div)
        url = pick_from_results(results)
        print('ok. retreiving yellowbridge page...')
        word_data = get_word_data(retrieve_page(url))
    elif main_div.xpath('//table[@id="mainData"]/tr'):  # only one result
        print('ok. found only one result. retrieving page...')
        word_data = get_word_data(main_div)
    else:  # got nothing
        print('could not find a match')
        print('exiting')
        sys.exit(1)
    print('retrieved!')
    # No ask_tags() helper is defined in this file, so tags are collected
    # by the private helper above when none came from the command line.
    tags = cli_tags if cli_tags else _prompt_for_tags()
    # Unpack by name: the tuple has an extra_definition slot at index 1, so
    # positional indexing in the preview would shift every label by one.
    (english, _extra_definition, pinging, ef_pinging,
     zhuyin, char, trad) = word_data
    print('')
    print('ok. creating the following note:')
    print('english: ' + english)
    print('pinging: ' + pinging)
    print('effective pinging: ' + ef_pinging)
    print('zhuyin: ' + zhuyin)
    print('character: ' + char)
    print('traditional character: ' + trad)
    tags_str = ' {} '.format(' '.join(tags)) if tags else ''
    print('tags: \'{}\''.format(tags_str))
    ask_ok()
    print('ok. creating note...')
    print('')
    # one line per note: the seven word fields, then the tags (if any) as a
    # single space-separated column
    ostring = u'\t'.join(word_data)
    if tags:
        ostring = u'{}\t{}'.format(ostring, u' '.join(tags))
    ofile.write(ostring.encode('utf8'))
    ofile.write('\n')
def ask_continue():
    """Ask whether another word should be added.

    Returns True for an empty answer, 'y' or 'Y', and False for 'n' or 'N'.
    Any other answer re-prompts after an 'invalid input.' notice.
    """
    question = 'do you want to add another? ([y]/n) ? '
    reply = raw_input(question)
    while True:
        if reply in ('', 'y', 'Y'):
            return True
        if reply in ('n', 'N'):
            return False
        reply = raw_input('invalid input.\n{}'.format(question))
if __name__ == '__main__':
    # Every command-line argument is a tag (commas removed) that is applied
    # to all notes created in this session.
    cli_tags = [arg.replace(',', '') for arg in sys.argv[1:]]
    if cli_tags:
        print('found command line tag')
        print('using {} as tags'.format(cli_tags))
        # the output file is named after the first tag
        out_name = 'new_cards_{}.csv'.format(cli_tags[0])
    else:
        # No tag to name the file after: use a fixed name instead of
        # failing with IndexError on cli_tags[0].
        out_name = 'new_cards.csv'
    # Columns per note: definition, extra definition, pinging, effective
    # pinging, zhuyin, character, traditional character, then tags if any.
    # No header row is written.
    # Buffering 0 keeps the file unbuffered, as before; the with-block
    # closes it even when add_word() leaves via sys.exit().
    with open(out_name, 'w', 0) as ofile:
        add_word(cli_tags, ofile)
        while ask_continue():
            add_word(cli_tags, ofile)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment