Last active
September 15, 2017 23:46
-
-
Save samm81/9909de5476461ae0855bde88b848d8b2 to your computer and use it in GitHub Desktop.
Simple script to create an Anki-importable CSV for new Chinese flash cards.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
# -*- coding: utf-8 -*- | |
from lxml import html | |
import requests | |
import sys | |
import readline | |
# curl 'http://www.yellowbridge.com/chinese/dictionary.php' -H 'Cookie: PHPSESSID=qijdue2gmrhs4pltdphjcqj3o2; _ga=GA1.2.112204757.1484656688; _gid=GA1.2.125872261.1495576784' -H 'Origin: http://www.yellowbridge.com' -H 'Accept-Encoding: gzip, deflate' -H 'Accept-Language: en-US,en;q=0.8' -H 'Upgrade-Insecure-Requests: 1' -H 'User-Agent: Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36' -H 'Content-Type: application/x-www-form-urlencoded' -H 'Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8' -H 'Cache-Control: max-age=0' -H 'Referer: http://www.yellowbridge.com/chinese/dictionary.php' -H 'Connection: keep-alive' -H 'DNT: 1' --data 'searchMode=P&word=shen2me' --compressed | |
# HTTP headers sent with every yellowbridge.com request made by this script.
# The values mirror the browser request recorded in the curl command above,
# including its session cookie.
headers = {
    'Cookie': 'PHPSESSID=qijdue2gmrhs4pltdphjcqj3o2; _ga=GA1.2.112204757.1484656688; _gid=GA1.2.125872261.1495576784',
    'Origin': 'http://www.yellowbridge.com',
    # 'Accept-Encoding': 'gzip, deflate' is part of the recorded request but
    # is deliberately not sent by this script.
    'Accept-Language': 'en-US,en;q=0.8',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36',
    'Content-Type': 'application/x-www-form-urlencoded',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    'Cache-Control': 'max-age=0',
    'Referer': 'http://www.yellowbridge.com/chinese/dictionary.php',
    'Connection': 'keep-alive',
    'DNT': '1',
}
def ask_ok():
    """Ask the user to confirm; terminate the program if they decline.

    An empty answer, 'y' or 'Y' confirms and the function returns normally.
    'n' or 'N' prints a notice and exits with status 1. Any other answer
    triggers an 'invalid input.' re-prompt until a valid one is given.
    """
    prompt = 'does this look ok ([y]/n) ? '
    answer = raw_input(prompt)
    while answer not in ('', 'y', 'Y', 'n', 'N'):
        answer = raw_input('invalid input.\n{}'.format(prompt))
    if answer in ('n', 'N'):
        print('ok. exiting')
        sys.exit(1)
def search_yellowbridge(pinging):
    """Run a pinyin search on yellowbridge and return the parsed result markup.

    Args:
        pinging: the pinyin search term typed by the user (e.g. 'shen2me').

    Returns:
        The lxml element for the div with id "tabBody" when the page has
        one, otherwise the root of the parsed page. Either way the caller
        can run its absolute ('//...') xpath queries against the result.
    """
    # Pass the query as params so requests URL-encodes the user's input;
    # a list of pairs keeps the parameter order stable.
    page = requests.get(
        'https://www.yellowbridge.com/chinese/dictionary.php',
        params=[('searchMode', 'P'), ('word', pinging)],
        headers=headers,
    )
    tree = html.fromstring(page.content)
    matches = tree.xpath('//div[@id="tabBody"]')
    # Fall back to the whole document instead of raising IndexError when the
    # page has no tabBody div; the caller then reports that nothing matched.
    return matches[0] if matches else tree
def choose_search_result(main_div):
    """Parse the multi-match results table into a list of candidate tuples.

    Returns one (character, pinyin, definition, url) tuple per <tr> of the
    table with id "multiRow". The url is the row's link made absolute by
    prefixing the yellowbridge host.
    """
    candidates = []
    for row in main_div.xpath('//table[@id="multiRow"]/tr'):
        char = row.xpath('td[1]/a/text()')[0]
        url = "https://www.yellowbridge.com" + row.xpath('td[1]/a/@href')[0]
        # text_content() is used for these two cells so text inside nested
        # markup is included, not only the cell's first text node.
        pinging = row.xpath('td[2]')[0].text_content()
        definition = row.xpath('td[3]')[0].text_content()
        candidates.append((char, pinging, definition, url))
    return candidates
def pick_from_results(results):
    """List the candidate matches and return the URL of the one the user picks.

    Each result is a (character, pinyin, definition, url) tuple. The user is
    re-prompted until they type an index between 0 and len(results) - 1.
    """
    print('found the following matches:')
    for index, entry in enumerate(results):
        print(u'{}\t{}\t{}\t{}'.format(index, *entry[:3]))
    prompt = 'which word [0-{}] ? '.format(len(results) - 1)
    answer = raw_input(prompt)
    # isdigit() already rules out negative numbers, so only the upper bound
    # needs checking before the answer is used as an index.
    while not (answer.isdigit() and 0 <= int(answer) < len(results)):
        answer = raw_input('invalid input.\n{}'.format(prompt))
    return results[int(answer)][3]  # url of choice
def retrieve_page(url):
    """Fetch url with the shared request headers and return the parsed lxml tree."""
    response = requests.get(url, headers=headers)
    return html.fromstring(response.content)
def get_word_data(tree):
    """Extract the flashcard fields from a parsed yellowbridge word page.

    Reads the rows of the table with id "mainData"; the first cell of a row
    is its label and the second cell its value.

    Args:
        tree: lxml element to query (the queries are absolute, so any
            element of the page works).

    Returns:
        A 7-tuple (english, extra_definition, pinyin, effective_pinyin,
        zhuyin, simplified, traditional). Fields not found on the page are
        empty strings; extra_definition is always empty. Traditional and
        effective pinyin are empty when the page reports them as 'Same'.
    """
    english = pinging = ef_pinging = zhuyin = char = trad = ''
    for row in tree.xpath('//table[@id="mainData"]/tr'):
        labels = row.xpath('td[1]/text()')
        cells = row.xpath('td[2]')
        if not labels or not cells:
            # Rows without both a label and a value cannot be a field row;
            # skip them instead of failing with IndexError.
            continue
        # Strip the label so matching does not depend on incidental
        # whitespace (the 'Effective Pinyin' label has been seen with a
        # trailing space).
        title = labels[0].strip()
        data = cells[0].text_content()
        if title == 'English Definition':
            if u'粵' in data:
                # Keep only the text in front of the '(粵)' marker.
                data = data.split(u'(粵)')[0]
            english = data
        elif title == 'Simplified Script':
            char = data
        elif title == 'Traditional Script':
            trad = '' if 'Same' in data else data
        elif title == 'Pinyin':
            if 'also' in data or 'pr.' in data:
                # Drop the alternative pronunciations that follow ' ,'.
                data = data.split(' ,')[0]
            pinging = data
        elif title == 'Effective Pinyin':
            ef_pinging = '' if 'Same' in data else data
        elif title == 'Zhuyin (Bopomofo)':
            zhuyin = data
    extra_definition = ''
    return (english, extra_definition, pinging, ef_pinging, zhuyin, char, trad)
def add_word(cli_tags, ofile):
    """Interactively look up one word and append it to the output file.

    Prompts for a pinyin search term, queries yellowbridge, lets the user
    pick among multiple matches, shows a preview of the note, and after
    confirmation writes one tab-separated line to ofile.

    Args:
        cli_tags: list of tag strings taken from the command line; when
            empty the tags are requested via ask_tags().
        ofile: writable file object that receives the UTF-8 encoded line.

    Exits the program with status 1 when the search finds nothing or the
    user rejects the preview (the latter via ask_ok()).
    """
    pinging = raw_input('pinging for word: ')
    print('searching for {}...'.format(pinging))
    main_div = search_yellowbridge(pinging)
    if main_div.xpath('//table[@id="multiRow"]/tr'):  # several candidates
        results = choose_search_result(main_div)
        url = pick_from_results(results)
        print('ok. retreiving yellowbridge page...')
        word_data = get_word_data(retrieve_page(url))
    elif main_div.xpath('//table[@id="mainData"]/tr'):  # exactly one result
        # The search response already is the word page, so parse it directly.
        print('ok. found only one result. retrieving page...')
        word_data = get_word_data(main_div)
    else:  # got nothing
        print('could not find a match')
        print('exiting')
        sys.exit(1)
    print('retrieved!')
    # NOTE(review): ask_tags is not defined in the visible part of this file;
    # confirm it exists, otherwise running without command-line tags raises
    # NameError here.
    tags = cli_tags if cli_tags else ask_tags()
    # Unpack by name so each preview label shows its own field; indexing by
    # position was off by one because extra_definition sits at index 1.
    english, _extra_definition, pinyin, ef_pinyin, zhuyin, char, trad = word_data
    print('ok. creating the following note:')
    print('english: ' + english)
    print('pinging: ' + pinyin)
    print('effective pinging: ' + ef_pinyin)
    print('zhuyin: ' + zhuyin)
    print('character: ' + char)
    print('traditional character: ' + trad)
    tags_str = ' {} '.format(' '.join(tags)) if len(tags) > 0 else ''
    print('tags: \'{}\''.format(tags_str))
    ask_ok()
    print('ok. creating note...')
    # One tab-separated record: the seven word fields, then (only when tags
    # exist) a final column holding the space-separated tags.
    fields = list(word_data)
    if tags:
        fields.append(u' '.join(tags))
    ofile.write(u'\t'.join(fields).encode('utf8'))
    ofile.write('\n')
def ask_continue():
    """Ask whether another word should be added.

    Returns True for an empty answer, 'y' or 'Y' and False for 'n' or 'N'.
    Any other answer re-prompts with an 'invalid input.' notice.
    """
    prompt = 'do you want to add another? ([y]/n) ? '
    reply = raw_input(prompt)
    while True:
        if reply in ('', 'y', 'Y'):
            return True
        if reply in ('n', 'N'):
            return False
        reply = raw_input('invalid input.\n{}'.format(prompt))
if __name__ == '__main__':
    # Every command-line argument is a tag; commas are stripped so a
    # comma-separated tag list can be given as well.
    cli_tags = [str(arg.replace(',', '')) for arg in sys.argv[1:]]
    if cli_tags:
        print('found command line tag')
        print('using {} as tags'.format(cli_tags))
        # The first tag names the output file.
        out_name = 'new_cards_{}.csv'.format(cli_tags[0])
    else:
        # No tags given: use a plain file name instead of failing with
        # IndexError on cli_tags[0].
        out_name = 'new_cards.csv'
    # Buffering 0 (Python 2) makes the file unbuffered, so each card is on
    # disk as soon as it is written. The with-block closes the file even
    # when add_word leaves through sys.exit(). No header row is written.
    with open(out_name, 'w', 0) as ofile:
        add_word(cli_tags, ofile)
        while ask_continue():
            add_word(cli_tags, ofile)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment