Mte90/clean.sh

## clean.sh
#!/usr/bin/env bash

# Unpack the *.txt and *.wav members of one archive into ./clips-mitads/.
# Arguments: $1 - archive name without extension; "./$1.rar" is opened
# Outputs:   a progress line on stdout, then whatever unrar prints
# Returns:   the exit status of unrar
extract() {
    local archive="./${1}.rar"

    echo "Estraggo ${1}.rar"
    # The masks are quoted so they reach unrar verbatim instead of being
    # expanded by the shell against the current directory.
    unrar e "${archive}" '*.txt' '*.wav' './clips-mitads/' -o+
}

# Make sure the extraction target exists (no-op when it is already there).
mkdir -p './clips-mitads'

extract "LETTO"
extract "RTV"
extract "DIALOGICO"
extract "ORTOFONICO"
# extract "TELEFONICO" is skipped: useless, it has no wav files

# Everything below deletes files relative to the current directory, so stop
# here if the directory cannot be entered.
cd ./clips-mitads/ || exit 1
rm -f ./*_tempi.txt

# Delete every visible *.wav whose name, right before the extension, ends in
# one character of "_abcdefghilmnopqrstuvz" followed by one character of
# "abcdefghilmnopqrstuvz". find hands the names to rm directly, so names
# containing whitespace are safe and the dot before "wav" is literal.
find . -maxdepth 1 -type f ! -name '.*' \
    -name '*[_abcdefghilmnopqrstuvz][abcdefghilmnopqrstuvz].wav' \
    -exec rm -f -- {} +

# Count the files that DO carry each extension, so that every number matches
# the label printed next to it.
counts=$(find './' -type f -name "*.txt" | wc -l)

echo "File txt da convertire $counts"

counts=$(find './' -type f -name "*.wav" | wc -l)

echo "File wav totali $counts"

## generate_csv.py
#!/usr/bin/env python

# Based on https://github.com/vslovik/LingITII/blob/master/scripts/cleaner.py

"""
Convert transcribed text into csv
Usage examples:
    python generate_csv.py DGmtA01F.txt
"""

import sys
import re
import os

# (marker, replacement) pairs applied to the transcript from top to bottom.
# The order is the one the script has always used and it matters: a later
# pair sees the output of the earlier ones.
_REPLACEMENTS = (
    ('<sp>', ''),
    ('<lp>', ''),
    ('<inspiration>', ''),
    ('<tongue-click>', ''),
    ('<creacky-voice>', ''),
    ('<NOISE>', ''),
    ('[screaming]', ''),
    (',', ''),
    ('!', ''),
    ('?', ''),
    ('*', ''),
    ('{', ''),
    ('}', ''),
    ('[whispering]', ''),
    ('<laugh>', ''),
    ('#', ''),
    ('<eeh>', ''),
    ('<mh>', ''),
    # Kept from the original sequence: the deletions just above can splice a
    # new '<laugh>' together (e.g. '<la#ugh>'), which this pass removes.
    ('<laugh>', ''),
    ('<mhmh>', ''),
    ('<ahah>', ' '),
    ('<ah>', 'ah'),
    ('<eh>', 'eh'),
    ('<ehm>', 'ehm'),
    ('<oh>', 'oh'),
    ('+', ' '),
    ('[dialect]', ' '),
    ('<oo>', ' '),
    ('/', ' '),
    ('\r', ''),
)


def transcript_to_csv(raw, stem):
    """Convert the raw text of one transcript into CSV rows.

    Args:
        raw: full content of the transcript file.
        stem: transcript file name without directory and extension; it
            prefixes every generated wav name.

    Returns:
        One ``<stem>#<id>.wav,<text>`` row per line, where ``<id>`` is the
        two word characters found between a ``p`` and ``": "`` in the
        transcript. Blank rows are dropped; the result has no trailing
        newline.
    """
    lines = [line for line in raw.split('\n') if line.strip() != '']

    # Skip the leading run of lines that start with an upper-case character
    # (presumably the transcript header -- confirm against the corpus). If
    # every line qualifies, nothing is left to convert.
    start = 0
    while start < len(lines) and lines[start][0].isupper():
        start += 1

    text = ''.join(lines[start:])
    for marker, replacement in _REPLACEMENTS:
        text = text.replace(marker, replacement)

    text = re.sub(r'\s+', ' ', text)
    # A callable replacement keeps `stem` literal even if it contains
    # backslashes, which a replacement template would try to interpret.
    text = re.sub(r'p(\w{2}): ',
                  lambda match: '\n' + stem + '#' + match.group(1) + '.wav,',
                  text)

    # Keep one row per line; only rows that are blank are discarded.
    return '\n'.join(row for row in text.split('\n') if row.strip() != '')


def main():
    """Print the CSV rows for the transcript named on the command line."""
    if len(sys.argv) < 2:
        sys.exit('Usage: python generate_csv.py <transcript.txt>')

    path = sys.argv[1]
    stem = os.path.splitext(os.path.basename(path))[0]

    with open(path) as txt_file:
        raw = txt_file.read()
    print(transcript_to_csv(raw, stem))


if __name__ == '__main__':
    main()
	#!/usr/bin/env bash

	# Unpack the *.txt and *.wav members of one archive into ./clips-mitads/.
	# Arguments: $1 - archive name without extension; "./$1.rar" is opened
	# Outputs:   a progress line on stdout, then whatever unrar prints
	# Returns:   the exit status of unrar
	extract() {
	    local archive="./${1}.rar"

	    echo "Estraggo ${1}.rar"
	    # The masks must be the globs '*.txt' / '*.wav'; they are quoted so
	    # they reach unrar verbatim instead of being expanded by the shell.
	    unrar e "${archive}" '*.txt' '*.wav' './clips-mitads/' -o+
	}

	# Make sure the extraction target exists (no-op when it is already there).
	mkdir -p './clips-mitads'

	extract "LETTO"
	extract "RTV"
	extract "DIALOGICO"
	extract "ORTOFONICO"
	# extract "TELEFONICO" is skipped: useless, it has no wav files

	# Everything below deletes files relative to the current directory, so
	# stop here if the directory cannot be entered.
	cd ./clips-mitads/ || exit 1
	rm -f ./*_tempi.txt

	# Delete every visible *.wav whose name, right before the extension, ends
	# in one character of "_abcdefghilmnopqrstuvz" followed by one character
	# of "abcdefghilmnopqrstuvz". find hands the names to rm directly, so
	# names containing whitespace are safe and the dot before "wav" is literal.
	find . -maxdepth 1 -type f ! -name '.*' \
	    -name '*[_abcdefghilmnopqrstuvz][abcdefghilmnopqrstuvz].wav' \
	    -exec rm -f -- {} +

	# Count the files that DO carry each extension, so that every number
	# matches the label printed next to it. The pipes are real pipes here,
	# not the literal "|" arguments an escaped "\|" would pass along.
	counts=$(find './' -type f -name "*.txt" | wc -l)

	echo "File txt da convertire $counts"

	counts=$(find './' -type f -name "*.wav" | wc -l)

	echo "File wav totali $counts"
	#!/usr/bin/env python

	# Based on https://github.com/vslovik/LingITII/blob/master/scripts/cleaner.py

	"""
	Convert transcribed text into csv
	Usage examples:
	python generate_csv.py DGmtA01F.txt
	"""

	import sys
	import re
	import os

	# (marker, replacement) pairs applied to the transcript from top to bottom.
	# The order is the one the script has always used and it matters: a later
	# pair sees the output of the earlier ones.
	_REPLACEMENTS = (
	    ('<sp>', ''),
	    ('<lp>', ''),
	    ('<inspiration>', ''),
	    ('<tongue-click>', ''),
	    ('<creacky-voice>', ''),
	    ('<NOISE>', ''),
	    ('[screaming]', ''),
	    (',', ''),
	    ('!', ''),
	    ('?', ''),
	    ('*', ''),
	    ('{', ''),
	    ('}', ''),
	    ('[whispering]', ''),
	    ('<laugh>', ''),
	    ('#', ''),
	    ('<eeh>', ''),
	    ('<mh>', ''),
	    # Kept from the original sequence: the deletions just above can splice
	    # a new '<laugh>' together (e.g. '<la#ugh>'), which this pass removes.
	    ('<laugh>', ''),
	    ('<mhmh>', ''),
	    ('<ahah>', ' '),
	    ('<ah>', 'ah'),
	    ('<eh>', 'eh'),
	    ('<ehm>', 'ehm'),
	    ('<oh>', 'oh'),
	    ('+', ' '),
	    ('[dialect]', ' '),
	    ('<oo>', ' '),
	    ('/', ' '),
	    ('\r', ''),
	)


	def transcript_to_csv(raw, stem):
	    """Convert the raw text of one transcript into CSV rows.

	    Args:
	        raw: full content of the transcript file.
	        stem: transcript file name without directory and extension; it
	            prefixes every generated wav name.

	    Returns:
	        One ``<stem>#<id>.wav,<text>`` row per line, where ``<id>`` is the
	        two word characters found between a ``p`` and ``": "`` in the
	        transcript. Blank rows are dropped; the result has no trailing
	        newline.
	    """
	    lines = [line for line in raw.split('\n') if line.strip() != '']

	    # Skip the leading run of lines that start with an upper-case character
	    # (presumably the transcript header -- confirm against the corpus). If
	    # every line qualifies, nothing is left to convert.
	    start = 0
	    while start < len(lines) and lines[start][0].isupper():
	        start += 1

	    text = ''.join(lines[start:])
	    for marker, replacement in _REPLACEMENTS:
	        text = text.replace(marker, replacement)

	    text = re.sub(r'\s+', ' ', text)
	    # A callable replacement keeps `stem` literal even if it contains
	    # backslashes, which a replacement template would try to interpret.
	    text = re.sub(r'p(\w{2}): ',
	                  lambda match: '\n' + stem + '#' + match.group(1) + '.wav,',
	                  text)

	    # Keep one row per line; only rows that are blank are discarded.
	    return '\n'.join(row for row in text.split('\n') if row.strip() != '')


	def main():
	    """Print the CSV rows for the transcript named on the command line."""
	    if len(sys.argv) < 2:
	        sys.exit('Usage: python generate_csv.py <transcript.txt>')

	    path = sys.argv[1]
	    stem = os.path.splitext(os.path.basename(path))[0]

	    with open(path) as txt_file:
	        raw = txt_file.read()
	    print(transcript_to_csv(raw, stem))


	if __name__ == '__main__':
	    main()