Skip to content

Instantly share code, notes, and snippets.

@soldni
Created November 11, 2021 22:26
Show Gist options
  • Save soldni/47fb3d67795eedc8bbc49438436462d0 to your computer and use it in GitHub Desktop.
Save soldni/47fb3d67795eedc8bbc49438436462d0 to your computer and use it in GitHub Desktop.
import csv
import json
import pymysql.cursors
import itertools
import tqdm
import re
# Connect to the database
connection = pymysql.connect(host='localhost',
user='ubuntu',
password='password',
database='2020.1',
autocommit=True,
cursorclass=pymysql.cursors.SSDictCursor)
mapping = {
"Target_Child": ["Target_Child"],
"Caregiver": ["Mother", "Father", "Grandmother", "Caretaker", "Grandfather"],
"Other_Child": ["Child", "Sister", "Brother", "Playmate", "Friend", "Student", "Sibling"],
"Other_Adult": ["Investigator", "Adult", "Teacher", "Nurse", "Visitor", "Target_Adult", "Doctor"],
"Other": ["Relative", "Participant", "Unidentified", "Subject", "Informant",
"Media", "Narrator", "Girl", "Group", "Teenager", "Environment",
"Boy", "Speaker", "Uncertain", "Male"]
}
inv_mapping = {}
for k, v in mapping.items():
for elem in v:
inv_mapping[elem] = k
all_data = []
with connection, connection.cursor() as cursor:
sql_count = "SELECT COUNT(id) FROM utterance;"
cursor.execute(sql_count)
count = cursor.fetchall()
count = int(count[0]['COUNT(id)'])
sql_rows = "SELECT * FROM utterance GROUP BY corpus_id, transcript_id, utterance_order, id"
cursor.execute(sql_rows)
current_transcript = {}
for row in tqdm.tqdm(cursor, desc='Reading utterances', unit=' utt', unit_scale=True, total=count):
# row = dict(zip(columns, row))
if row['language'] != 'eng':
continue
if (current_transcript.get('corpus_id', None) != row['corpus_id'] or
current_transcript.get('transcript_id', None) != row['transcript_id']):
if current_transcript:
all_data.append(current_transcript)
current_transcript = {}
current_transcript.setdefault('utterances', [])
for key in ('collection_name', 'corpus_id', 'transcript_id', 'target_child_name', 'target_child_age',
'target_child_sex', 'target_child_id', 'corpus_name', 'language'):
current_transcript[key] = row[key]
current_transcript['utterances'].append({
'id': row['id'],
'gloss': row['gloss'],
'type': row['type'],
'part_of_speech': row['part_of_speech'],
'speaker_name': row['speaker_name'],
'speaker_role': inv_mapping.get(row['speaker_role'], 'Other')
})
if len(current_transcript):
all_data.append(current_transcript)
with open('/home/ubuntu/childes/childes.eng.jsonl', 'w', encoding='utf-8') as out_file:
for doc in tqdm.tqdm(all_data, desc='Writing to file', unit=' docs', unit_scale=True):
out_file.write(json.dumps(doc, sort_keys=True) + '\n')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment