Created
January 3, 2016 17:34
-
-
Save waf/c1cf7b9016d2b2313d7b to your computer and use it in GitHub Desktop.
Python 2 script that parses Stuart Jay Raj's "First Words to Learn - Enhanced Wordlist from Fluency Forever Multi Lang"
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python
# Python 2 script that parses Stuart Jay Raj's "First Words to Learn - Enhanced Wordlist from Fluency Forever Multi Lang"
# Input file should be the JSON export here: https://spreadsheets.google.com/feeds/list/1GsNe8GVzgIuOeeEVBCcoIvxVMDO5xnEgGRr8EKtcSGI/1/public/basic?alt=json
# See original spreadsheet here: https://docs.google.com/spreadsheets/d/1GsNe8GVzgIuOeeEVBCcoIvxVMDO5xnEgGRr8EKtcSGI/edit#gid=884321509
#
# Output: a JSON file named after QUESTION_LANGUAGE that maps each category
# title to a list of {"question": ..., "answer": ...} objects.
import json
import codecs

# Keys (as they appear inside each entry's content string) of the two columns
# to extract.
QUESTION_LANGUAGE = 'thaith'
ANSWER_LANGUAGE = 'englishen'
INPUT_FILENAME = 'basic.json'

# Entries whose title starts with this prefix continue the current category;
# any other title starts a new category named after that title.
ROW_TITLE_PREFIX = 'Row: '


def parse_translations(content):
    """Turn one entry's content mega-string into a ``{key: value}`` dict.

    The string is expected to look like ``"key: value, key: value"``.  Each
    ", "-separated fragment is split on its first ": " only, so a value that
    itself contains ": " is kept whole.  A fragment with no ": " is treated
    as the continuation of the previous value (the value contained ", ") and
    is re-attached to it; such a fragment appearing before any key is dropped.

    NOTE(review): a value containing ", " followed by text that itself
    contains ": " is still ambiguous in this format and will be read as a
    new key.
    """
    translations = dict()
    last_key = None
    for fragment in content.split(", "):
        key, separator, value = fragment.partition(": ")
        if separator:
            translations[key] = value
            last_key = key
        elif last_key is not None:
            # No "key: " prefix: this piece belongs to the previous value.
            translations[last_key] += ", " + fragment
    return translations


def group_entries(entries, question_language=QUESTION_LANGUAGE,
                  answer_language=ANSWER_LANGUAGE):
    """Group feed entries into ``{category: [{'question', 'answer'}, ...]}``.

    ``entries`` is an iterable of dicts carrying ``entry['title']['$t']`` and
    ``entry['content']['$t']``.  An entry whose title does not start with
    ``ROW_TITLE_PREFIX`` opens a new category (and is itself also parsed as a
    word); the following "Row: " entries are appended to that category.

    Entries lacking either the question or the answer language are skipped
    rather than aborting the whole run with a ``KeyError``.  Entries that
    appear before the first category title are collected in a list that is
    never stored, so they do not appear in the result.
    """
    parsed = dict()  # this will hold the json that we write to file
    words = list()   # all the words of the category currently being filled
    for entry in entries:
        category = entry['title']['$t']
        content = entry['content']['$t']
        if not category.startswith(ROW_TITLE_PREFIX):
            words = list()
            parsed[category] = words
        translations = parse_translations(content)
        if (question_language not in translations
                or answer_language not in translations):
            continue  # row has no usable question/answer pair
        words.append({
            'question': translations[question_language],
            'answer': translations[answer_language],
        })
    return parsed


def main():
    """Read INPUT_FILENAME, group its entries and write ``<QUESTION_LANGUAGE>.json``."""
    with codecs.open(INPUT_FILENAME, 'r', encoding='utf8') as json_file:
        data = json.loads(json_file.read())
    # The input file is closed at this point; only the parsed data is needed.
    parsed = group_entries(data['feed']['entry'])

    output_filename = QUESTION_LANGUAGE + ".json"
    with codecs.open(output_filename, 'w', encoding='utf8') as output_file:
        # ensure_ascii=False keeps non-ASCII text readable in the output file.
        json.dump(parsed, output_file, indent=4, ensure_ascii=False)
    print("Created " + output_filename)


if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment