Last active
June 13, 2021 01:13
-
-
Save bact/a001e870ec3fcc306cb8acbb2ba906bf to your computer and use it in GitHub Desktop.
Prepare text from Twitter archive JSON for Thai Common Voice Sentence Collector submission
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# 1) export text from JSON
import json
import pandas as pd
from pandas import json_normalize

# Flatten the Twitter archive JSON, keep only Thai-language tweets, and
# write the stripped tweet texts one per line to a plain text file.
# note: have to remove the `window.YTD...` JavaScript prefix on the first
# line of the archive file so it parses as plain JSON.
with open("bact-tweet.json", encoding="utf-8") as jsonf:  # context manager: no leaked handle
    data = json.load(jsonf)

# json_normalize() already returns a DataFrame; wrapping it in
# pd.DataFrame.from_records() was redundant.
df = json_normalize(data)
df = df[df["tweet.lang"] == "th"]
texts = df["tweet.full_text"].str.strip()

with open("bact-tweet.txt", "w", encoding="utf-8") as f:
    f.writelines(text + "\n" for text in texts)
####
# 2) normalize text
import re
from pythainlp.util import normalize

# Codepoint ranges of emoji/pictographs to strip from tweet text.
_EMOJI_RANGES = (
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F1E0-\U0001F1FF"  # flags (iOS)
    "\U00002702-\U000027B0"  # dingbats
    "\U000024C2-\U0001F251"  # enclosed characters
)
# Match one or more consecutive characters from any of the ranges above.
emoji_pattern = re.compile("[" + _EMOJI_RANGES + "]+", flags=re.UNICODE)
# Clean each raw tweet line: strip zero-width characters, URLs, reply/RT
# prefixes, hashtag marks, emoji, stray Thai combining marks at word starts,
# duplicated symbols, and trailing punctuation; drop lines left empty.
# All patterns are compiled once, before the loop, instead of passing raw
# strings to re.sub() on every line.
_ZERO_WIDTH = re.compile(r"[\u200b\u200c\u2063\u200e]")
_URL = re.compile(
    r"((https?|ftp):((//)|(\\\\))+([\w\d:#@%/;$()~_?\+-=\\\.&](#!)?)*)"
    r"|(www\.([\w\d:#@%/;$()~_?\+-=\\\.&](#!)?)*)"
)
_REPLY_RT = re.compile(r"^\s*(((RT|rt)\s)?(@[\w\d]+:?\s+))+")
_DASH_QUOTE = re.compile(r"- -[\"']")
# erroneous tone marks, Phinthu, Thanthakhat, Nikhahit, Yamakkan,
# above/below vowels at the start of a word (duplicate \u0E4D removed
# from the original class -- duplicates in a char class are no-ops)
_BAD_MARK_AT_WORD_START = re.compile(
    r"(^|\s)[\u0E48\u0E49\u0E4A\u0E4B\u0E3A\u0E4C\u0E4D\u0E4E"
    r"\u0E31\u0E34\u0E35\u0E36\u0E37\u0E47\u0E38\u0E39]+"
)
_DUP_CONSONANT = re.compile(r"([ก-ฮ])([ ]*\1){6,}")
_BRACKET_TAG = re.compile(r"^\s*\[([a-zA-Z0-9])+\]\s+")
_BRACKETS = re.compile(r"[()\[\]]")
_ELLIPSIS = re.compile(r"(\.{2,}|…)")
_LONE_QUOTE = re.compile(r"^([^\"]+)?[\"]([^\"]+)?$")
_DUP_SYMBOL = re.compile(r"([\s!?:;,-])([ ]*\1)+")
_TRAILING_SYMBOL = re.compile(r"[,;:-]\s*$")

with open("bact-tweet.txt", "r", encoding="utf-8") as inputf, \
        open("bact-tweet-normalized.txt", "w", encoding="utf-8") as outputf:
    for line in inputf:
        line = _ZERO_WIDTH.sub("", line)      # remove zero-width char
        line = _URL.sub(" ", line)            # remove url
        line = _REPLY_RT.sub(" ", line)       # remove reply and RT
        line = line.replace("#", " ")         # dehashtag (plain replace; no regex needed)
        line = emoji_pattern.sub(" ", line)   # remove emoji
        line = _DASH_QUOTE.sub(" ", line)     # remove - -" and - -'
        line = _BAD_MARK_AT_WORD_START.sub(" ", line)
        line = normalize(line)                # normalize Thai char order
        line = _DUP_CONSONANT.sub(r"\1\1\1\1\1\1\1", line)  # cap repeated consonants at 7
        line = _BRACKET_TAG.sub("", line)     # remove [xxx] at start of line
        line = _BRACKETS.sub("", line)        # remove parentheses, brackets
        line = _ELLIPSIS.sub("", line)        # remove ellipsis
        line = _LONE_QUOTE.sub(r"\1\2", line) # remove lone quotation mark
        line = _DUP_SYMBOL.sub(r"\1", line)   # remove duplicate symbol/space
        line = _TRAILING_SYMBOL.sub("", line) # remove symbol at end of line
        # strip() covers the original's extra leading/trailing-space regex pass
        line = line.strip()
        if line:
            outputf.write(line + "\n")
####
# 3) split long text to clauses, fix possible errors
import re
from pythainlp import sent_tokenize

# Characters that disqualify a sentence from the corpus: Latin letters,
# Arabic/Thai digits, Maiyamok, Baht sign, and assorted symbols.
_INVALID_CHAR = re.compile(r"[a-zA-Z0-9๐๑๒๓๔๕๖๗๘๙\u0e46\u0e3f$&()/\+=*@]")

def _clean_sentence(sentence):
    """Strip stray Thai marks/vowels and trailing symbols, collapse spaces.

    Shared cleanup previously duplicated in both branches of the loop below.
    """
    # remove erroneous tone marks, Phinthu, Thanthakhat, Nikhahit, Yamakkan,
    # Maiyamok, follow/above/below vowels at the start of the sentence
    sentence = re.sub(
        r"^\s*[ะา\u0e45\u0e46\u0E48\u0E49\u0E4A\u0E4B\u0E3A\u0E4C\u0E4D\u0E4E"
        r"\u0E31\u0E34\u0E35\u0E36\u0E37\u0E47\u0E38\u0E39]+",
        "", sentence)
    sentence = re.sub(r"[เแโไใ]+\s*$", "", sentence)  # remove lead vowel at end of sentence
    sentence = re.sub(r"[,;:-]\s*$", "", sentence)    # remove symbol at end of line
    sentence = re.sub(r"\s+", " ", sentence)          # collapse whitespace runs
    return sentence.strip()

with open("bact-tweet-normalized.txt", "r", encoding="utf-8") as inputf, \
        open("bact-tweet-clause.txt", "w", encoding="utf-8") as outputf:
    for line in inputf:
        line = line.strip()
        if len(line) < 2:
            continue
        if len(line) <= 100 and not _INVALID_CHAR.search(line):
            # Short, already-valid line: clean it in place.
            # BUG FIX: the original computed the cleaned `sentence` here but
            # then wrote the untouched `line`, discarding all the cleanup.
            sentence = _clean_sentence(line)
            if sentence:
                outputf.write(sentence + "\n")
        else:
            # Long or symbol-laden line: split into sentences and keep only
            # those that pass the length and character checks after cleanup.
            for sentence in sent_tokenize(line):
                # remove lone quotation mark
                sentence = re.sub(r"^([^\"]+)?[\"]([^\"]+)?$", r"\1\2", sentence)
                sentence = _clean_sentence(sentence)
                if 2 <= len(sentence) <= 100 and not _INVALID_CHAR.search(sentence):
                    outputf.write(sentence + "\n")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment.