Last active
June 13, 2021 01:13
-
-
Save bact/a001e870ec3fcc306cb8acbb2ba906bf to your computer and use it in GitHub Desktop.
Prepare text from Twitter archive JSON for Thai Common Voice Sentence Collector submission
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# 1) export text from JSON
import json
import pandas as pd
from pandas import json_normalize

# Flatten the Twitter archive JSON, keep only Thai-language tweets, and
# write the stripped tweet texts one per line to a plain text file.
# note: have to remove the `window.YTD...` JavaScript prefix on the first
# line of the archive file so it parses as plain JSON.
with open("bact-tweet.json", encoding="utf-8") as jsonf:  # context manager: no leaked handle
    data = json.load(jsonf)

# json_normalize() already returns a DataFrame; wrapping it in
# pd.DataFrame.from_records() was redundant.
df = json_normalize(data)
df = df[df["tweet.lang"] == "th"]
texts = df["tweet.full_text"].str.strip()

with open("bact-tweet.txt", "w", encoding="utf-8") as f:
    f.writelines(text + "\n" for text in texts)
####
# 2) normalize text
import re
from pythainlp.util import normalize

# Codepoint ranges of emoji/pictographs to strip from tweet text.
_EMOJI_RANGES = (
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F1E0-\U0001F1FF"  # flags (iOS)
    "\U00002702-\U000027B0"  # dingbats
    "\U000024C2-\U0001F251"  # enclosed characters
)
# Match one or more consecutive characters from any of the ranges above.
emoji_pattern = re.compile("[" + _EMOJI_RANGES + "]+", flags=re.UNICODE)
# Clean each raw tweet line: strip zero-width characters, URLs, reply/RT
# prefixes, hashtag marks, emoji, stray Thai combining marks at word starts,
# duplicated symbols, and trailing punctuation; drop lines left empty.
# All patterns are compiled once, before the loop, instead of passing raw
# strings to re.sub() on every line.
_ZERO_WIDTH = re.compile(r"[\u200b\u200c\u2063\u200e]")
_URL = re.compile(
    r"((https?|ftp):((//)|(\\\\))+([\w\d:#@%/;$()~_?\+-=\\\.&](#!)?)*)"
    r"|(www\.([\w\d:#@%/;$()~_?\+-=\\\.&](#!)?)*)"
)
_REPLY_RT = re.compile(r"^\s*(((RT|rt)\s)?(@[\w\d]+:?\s+))+")
_DASH_QUOTE = re.compile(r"- -[\"']")
# erroneous tone marks, Phinthu, Thanthakhat, Nikhahit, Yamakkan,
# above/below vowels at the start of a word (duplicate \u0E4D removed
# from the original class -- duplicates in a char class are no-ops)
_BAD_MARK_AT_WORD_START = re.compile(
    r"(^|\s)[\u0E48\u0E49\u0E4A\u0E4B\u0E3A\u0E4C\u0E4D\u0E4E"
    r"\u0E31\u0E34\u0E35\u0E36\u0E37\u0E47\u0E38\u0E39]+"
)
_DUP_CONSONANT = re.compile(r"([ก-ฮ])([ ]*\1){6,}")
_BRACKET_TAG = re.compile(r"^\s*\[([a-zA-Z0-9])+\]\s+")
_BRACKETS = re.compile(r"[()\[\]]")
_ELLIPSIS = re.compile(r"(\.{2,}|…)")
_LONE_QUOTE = re.compile(r"^([^\"]+)?[\"]([^\"]+)?$")
_DUP_SYMBOL = re.compile(r"([\s!?:;,-])([ ]*\1)+")
_TRAILING_SYMBOL = re.compile(r"[,;:-]\s*$")

with open("bact-tweet.txt", "r", encoding="utf-8") as inputf, \
        open("bact-tweet-normalized.txt", "w", encoding="utf-8") as outputf:
    for line in inputf:
        line = _ZERO_WIDTH.sub("", line)      # remove zero-width char
        line = _URL.sub(" ", line)            # remove url
        line = _REPLY_RT.sub(" ", line)       # remove reply and RT
        line = line.replace("#", " ")         # dehashtag (plain replace; no regex needed)
        line = emoji_pattern.sub(" ", line)   # remove emoji
        line = _DASH_QUOTE.sub(" ", line)     # remove - -" and - -'
        line = _BAD_MARK_AT_WORD_START.sub(" ", line)
        line = normalize(line)                # normalize Thai char order
        line = _DUP_CONSONANT.sub(r"\1\1\1\1\1\1\1", line)  # cap repeated consonants at 7
        line = _BRACKET_TAG.sub("", line)     # remove [xxx] at start of line
        line = _BRACKETS.sub("", line)        # remove parentheses, brackets
        line = _ELLIPSIS.sub("", line)        # remove ellipsis
        line = _LONE_QUOTE.sub(r"\1\2", line) # remove lone quotation mark
        line = _DUP_SYMBOL.sub(r"\1", line)   # remove duplicate symbol/space
        line = _TRAILING_SYMBOL.sub("", line) # remove symbol at end of line
        # strip() covers the original's extra leading/trailing-space regex pass
        line = line.strip()
        if line:
            outputf.write(line + "\n")
####
# 3) split long text to clauses, fix possible errors
import re
from pythainlp import sent_tokenize

# Characters that disqualify a sentence from the corpus: Latin letters,
# Arabic/Thai digits, Maiyamok, Baht sign, and assorted symbols.
_INVALID_CHAR = re.compile(r"[a-zA-Z0-9๐๑๒๓๔๕๖๗๘๙\u0e46\u0e3f$&()/\+=*@]")

def _clean_sentence(sentence):
    """Strip stray Thai marks/vowels and trailing symbols, collapse spaces.

    Shared cleanup previously duplicated in both branches of the loop below.
    """
    # remove erroneous tone marks, Phinthu, Thanthakhat, Nikhahit, Yamakkan,
    # Maiyamok, follow/above/below vowels at the start of the sentence
    sentence = re.sub(
        r"^\s*[ะา\u0e45\u0e46\u0E48\u0E49\u0E4A\u0E4B\u0E3A\u0E4C\u0E4D\u0E4E"
        r"\u0E31\u0E34\u0E35\u0E36\u0E37\u0E47\u0E38\u0E39]+",
        "", sentence)
    sentence = re.sub(r"[เแโไใ]+\s*$", "", sentence)  # remove lead vowel at end of sentence
    sentence = re.sub(r"[,;:-]\s*$", "", sentence)    # remove symbol at end of line
    sentence = re.sub(r"\s+", " ", sentence)          # collapse whitespace runs
    return sentence.strip()

with open("bact-tweet-normalized.txt", "r", encoding="utf-8") as inputf, \
        open("bact-tweet-clause.txt", "w", encoding="utf-8") as outputf:
    for line in inputf:
        line = line.strip()
        if len(line) < 2:
            continue
        if len(line) <= 100 and not _INVALID_CHAR.search(line):
            # Short, already-valid line: clean it in place.
            # BUG FIX: the original computed the cleaned `sentence` here but
            # then wrote the untouched `line`, discarding all the cleanup.
            sentence = _clean_sentence(line)
            if sentence:
                outputf.write(sentence + "\n")
        else:
            # Long or symbol-laden line: split into sentences and keep only
            # those that pass the length and character checks after cleanup.
            for sentence in sent_tokenize(line):
                # remove lone quotation mark
                sentence = re.sub(r"^([^\"]+)?[\"]([^\"]+)?$", r"\1\2", sentence)
                sentence = _clean_sentence(sentence)
                if 2 <= len(sentence) <= 100 and not _INVALID_CHAR.search(sentence):
                    outputf.write(sentence + "\n")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment.