@bact
Last active June 13, 2021 01:13
Prepare text from Twitter archive JSON for Thai Common Voice Sentence Collector submission
# 1) export text from JSON
import json
import pandas as pd
from pandas import json_normalize
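# json_normalize() flattens the nested tweet objects, so nested fields show
# up as dotted column names such as "tweet.lang" and "tweet.full_text" below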
# note: the `window.YTD...` JavaScript assignment at the start of the
# archive file has to be removed first so that it parses as valid JSON
with open("bact-tweet.json") as jsonf:
    data = json.load(jsonf)
df = pd.DataFrame.from_records(json_normalize(data))
df = df[df["tweet.lang"] == "th"]  # keep only tweets tagged as Thai
df = df["tweet.full_text"]         # keep only the tweet text
df = df.str.strip()
with open("bact-tweet.txt", "w") as f:
    f.writelines(line + "\n" for line in df.tolist())
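# Optional (a sketch, not part of the original workflow): the JavaScript
# assignment prefix can also be stripped programmatically instead of editing
# the archive file by hand. The file name `tweet.js` is an assumption, as is
# the idea that the JSON array starts at the first "[".
def load_twitter_archive(path="tweet.js"):
    with open(path, encoding="utf-8") as f:
        raw = f.read()
    # drop the `window.YTD...=` prefix, keeping only the JSON array
    return json.loads(raw[raw.index("["):])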
####
# 2) normalize text
import re
from pythainlp.util import normalize
emoji_pattern = re.compile(
    "["
    u"\U0001F600-\U0001F64F"  # emoticons
    u"\U0001F300-\U0001F5FF"  # symbols & pictographs
    u"\U0001F680-\U0001F6FF"  # transport & map symbols
    u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
    u"\U00002702-\U000027B0"  # dingbats
    u"\U000024C2-\U0001F251"  # enclosed characters
    "]+", flags=re.UNICODE)
with open("bact-tweet.txt", "r") as inputf, open("bact-tweet-normalized.txt", "w") as outputf:
for line in inputf:
line = re.sub(r"[\u200b\u200c\u2063\u200e]", "", line) # remove zero-width char
line = re.sub(
r"((https?|ftp):((//)|(\\\\))+([\w\d:#@%/;$()~_?\+-=\\\.&](#!)?)*)"
r"|(www\.([\w\d:#@%/;$()~_?\+-=\\\.&](#!)?)*)"
, " ", line) # remove url
line = re.sub(r"^\s*(((RT|rt)\s)?(@[\w\d]+:?\s+))+", " ", line) # remove reply and RT
line = re.sub(r"#", " ", line) # dehashtag
line = emoji_pattern.sub(" ", line) # remove emoji
line = re.sub(r"- -[\"']", " ", line) # remove - -" - -'
# remove erroneous tone marks, Phinthu, Thanthakhat, Nikhahit, Yamakkan, above/below vowels at start of word
line = re.sub(r"(^|\s)[\u0E48\u0E49\u0E4A\u0E4B\u0E3A\u0E4C\u0E4D\u0E4E\u0E31\u0E34\u0E35\u0E36\u0E37\u0E4D\u0E47\u0E38\u0E39]+", " ", line)
line = normalize(line) # normalize Thai char order
line = re.sub(r"([ก-ฮ])([ ]*\1){6,}", r"\1\1\1\1\1\1\1", line) # reduce duplicate consonants
line = re.sub(r"^\s*\[([a-zA-Z0-9])+\]\s+", "", line) # remove [xxx] at start of line
line = re.sub(r"[()\[\]]", "", line) # remove parentheses, brackets
line = re.sub(r"(\.{2,}|…)", "", line) # remove ellipsis
line = re.sub(r"^([^\"]+)?[\"]([^\"]+)?$", r"\1\2", line) # remove lone quotation mark
line = re.sub(r"([\s!?:;,-])([ ]*\1)+", r"\1", line) # remove duplicate symbol/space
line = re.sub(r"[,;:-]\s*$", "", line) # remove symbol at end of line
line = re.sub(r"(^\s+|\s+$)", "", line) # remove space at start/end of line
line = line.strip()
if len(line) > 0:
outputf.write(line)
outputf.write("\n")
####
# 3) split long text into clauses, fix possible errors
import re
from pythainlp import sent_tokenize
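# sent_tokenize() uses PyThaiNLP's default sentence segmentation engine
# ("crfcut" in PyThaiNLP 2.x); other engines can be selected via the
# `engine` argument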
with open("bact-tweet-normalized.txt", "r") as inputf, open("bact-tweet-clause.txt", "w") as outputf:
for line in inputf:
line = line.strip()
if len(line) < 2:
continue
if (len(line) <= 100) and (not re.search(r"[a-zA-Z0-9๐๑๒๓๔๕๖๗๘๙\u0e46\u0e3f$&()/\+=*@]", line)):
sentence = line
# remove erroneous tone marks, Phinthu, Thanthakhat, Nikhahit, Yamakkan, Maiyamok, follow/above/below vowels at start of sentence
sentence = re.sub(r"^\s*[ะา\u0e45\u0e46\u0E48\u0E49\u0E4A\u0E4B\u0E3A\u0E4C\u0E4D\u0E4E\u0E31\u0E34\u0E35\u0E36\u0E37\u0E4D\u0E47\u0E38\u0E39]+", "", sentence)
# remove lead vowel at end of sentence
sentence = re.sub(r"[เแโไใ]+\s*$", "", sentence)
sentence = re.sub(r"[,;:-]\s*$", "", sentence) # remove symbol at end of line
sentence = re.sub(r"\s+", " ", sentence)
sentence = sentence.strip()
outputf.write(line)
outputf.write("\n")
else:
for sentence in sent_tokenize(line):
sentence = re.sub(r"^([^\"]+)?[\"]([^\"]+)?$", r"\1\2", sentence) # remove lone quotation mark
# remove erroneous tone marks, Phinthu, Thanthakhat, Nikhahit, Yamakkan, Maiyamok, follow/above/below vowels at start of sentence
sentence = re.sub(r"^\s*[ะา\u0e45\u0e46\u0E48\u0E49\u0E4A\u0E4B\u0E3A\u0E4C\u0E4D\u0E4E\u0E31\u0E34\u0E35\u0E36\u0E37\u0E4D\u0E47\u0E38\u0E39]+", "", sentence)
# remove lead vowel at end of sentence
sentence = re.sub(r"[เแโไใ]+\s*$", "", sentence)
sentence = re.sub(r"[,;:-]\s*$", "", sentence) # remove symbol at end of line
sentence = re.sub(r"\s+", " ", sentence)
sentence = sentence.strip()
if len(sentence) >= 2 and len(sentence) <= 100:
if not re.search(r"[a-zA-Z0-9๐๑๒๓๔๕๖๗๘๙\u0e46\u0e3f$&()/\+=*@]", sentence): # remove sentence with invalid char
outputf.write(sentence)
outputf.write("\n")