Skip to content

Instantly share code, notes, and snippets.

@waf
Created January 3, 2016 17:34
Show Gist options
  • Save waf/c1cf7b9016d2b2313d7b to your computer and use it in GitHub Desktop.
Save waf/c1cf7b9016d2b2313d7b to your computer and use it in GitHub Desktop.
Python 2 script that parses Stuart Jay Raj's "First Words to Learn - Enhanced Wordlist from Fluency Forever Multi Lang"
#!/usr/bin/env python
# Python 2 script that parses Stuart Jay Raj's "First Words to Learn - Enhanced Wordlist from Fluency Forever Multi Lang"
# Input file should be the JSON export here: https://spreadsheets.google.com/feeds/list/1GsNe8GVzgIuOeeEVBCcoIvxVMDO5xnEgGRr8EKtcSGI/1/public/basic?alt=json
# See original spreadsheet here: https://docs.google.com/spreadsheets/d/1GsNe8GVzgIuOeeEVBCcoIvxVMDO5xnEgGRr8EKtcSGI/edit#gid=884321509
import json
import codecs
QUESTION_LANGUAGE = 'thaith'
ANSWER_LANGUAGE = 'englishen'
INPUT_FILENAME = 'basic.json'


def parse_translations(content):
    """Turn one row's content mega-string into a {column: value} dictionary.

    The content looks like "key: value, key: value". It is split on ", " and
    each fragment is split on its FIRST ": " only, so a value that itself
    contains ": " is kept whole. A fragment with no ": " is treated as the
    continuation of the previous value (which contained ", ") and is glued
    back on instead of being dropped.

    Returns an empty dictionary for empty content.
    """
    translations = dict()
    last_key = None
    for fragment in content.split(", "):
        key, separator, value = fragment.partition(": ")
        if separator:
            translations[key] = value
            last_key = key
        elif last_key is not None:
            # No "key: " prefix: this is the tail of a value that had ", " in it.
            translations[last_key] += ", " + fragment
    return translations


def build_wordlist(entries, question_language=QUESTION_LANGUAGE,
                   answer_language=ANSWER_LANGUAGE):
    """Group feed entries into {category: [{'question': ..., 'answer': ...}]}.

    entries is the list found under data['feed']['entry']; each entry is read
    through entry['title']['$t'] and entry['content']['$t'].

    An entry whose title does not start with 'Row: ' opens a new category
    named by that title (a repeated title replaces the earlier category);
    every entry, including that one, contributes a word to the category that
    is currently open. Rows seen before the first category are collected in a
    list that is never stored, so they do not appear in the result.

    Rows that lack either requested language are skipped rather than aborting
    the whole run with a KeyError.
    NOTE(review): presumably the export omits blank cells, which is how a row
    can lack a language column -- confirm against the actual feed.
    """
    parsed = dict()  # this will hold the json that we write to file
    words = list()   # all the words in the category currently being filled

    for entry in entries:
        category = entry['title']['$t']
        content = entry['content']['$t']

        if not category.startswith('Row: '):
            words = list()
            parsed[category] = words

        translations = parse_translations(content)
        try:
            question = translations[question_language]
            answer = translations[answer_language]
        except KeyError:
            continue

        words.append({'question': question, 'answer': answer})

    return parsed


def main():
    """Read INPUT_FILENAME, build the word list and write <QUESTION_LANGUAGE>.json."""
    with codecs.open(INPUT_FILENAME, 'r', encoding='utf8') as json_file:
        data = json.load(json_file)

    parsed = build_wordlist(data['feed']['entry'])

    output_filename = QUESTION_LANGUAGE + ".json"
    with codecs.open(output_filename, 'w', encoding='utf8') as output_file:
        # ensure_ascii=False keeps non-ASCII text readable in the output file.
        json.dump(parsed, output_file, indent=4, ensure_ascii=False)

    print("Created " + output_filename)


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment