# Count the number of distinct whitespace-separated words in `files`
# (tr puts one word per line; sort | uniq removes duplicates).
cat files | tr ' ' '\n' | sort | uniq | wc -w
# Build a word-frequency list: each distinct word prefixed by its count.
cat corpus_file | tr ' ' '\n' | sort | uniq -c > outfile.freq
# Copy a range of lines (here 16224-16482) into a new file.
sed -n '16224,16482p' filename > newfile
# Delete a range of lines in place: generic template, then a concrete example.
sed -i <file> -re '<start>,<end>d'
sed -i filename -re '16224,16482d'
# Strip leading spaces from every line, in place.
sed -i 's/^ *//' your_file
# Replace every A with B, in place.
sed -i -e 's/A/B/g' file
The -i option edits the file in place.
The -e option supplies the expression/command to run — in this case the s (substitute) command.
# Delete every space character from input.txt.
tr -d ' ' < input.txt > output.txt
# Squeeze each run of spaces into a single space, in place.
# (Fixed: a space was missing before the filename, and ' *' also matches the
# empty string — which would insert a space between every pair of characters;
# '  *' requires at least one space.)
sed -i 's/  */ /g' file
# Strip Arabic diacritics (tashkeel marks) from the file.
sed -e 's/َ//g' -e 's/ُ//g' -e 's/ِ//g' -e 's/ّ//g' -e 's/ً//g' -e 's/ٌ//g' -e 's/ٍ//g' -e 's/ْ//g' file
# Delete all POSIX punctuation characters, in place.
sed -i -e "s/[[:punct:]]\+//g" file
# Delete Arabic punctuation (question mark, comma, semicolon), in place.
sed -i -e 's/؟//g' -e 's/،//g' -e 's/؛//g' file
# Print only the text enclosed between <s> and </s> on each line.
sed -n 's:.*<s>\(.*\)</s>.*:\1:p' file > outfile
# Trim leading and trailing spaces/tabs from every line.
cat file | sed 's/^[ \t]*//;s/[ \t]*$//' > outfile
# Combine the two steps above: extract <s>...</s> content, then trim it.
sed -n 's:.*<s>\(.*\)</s>.*:\1:p' file | sed 's/^[ \t]*//;s/[ \t]*$//' > outfile
# Delete parenthesized text "(...)", parentheses included.
sed -e 's/[(][^)]*[)]//g' file
# Delete the last word on each line.
sed 's/\w*$//' file
# Append " suffix" to every line.
awk '{print $0 " suffix"}' infile
# Split the file into 8-line chunks with 4-digit numeric suffixes (x0000, ...).
split -l 8 -a 4 -d file.ext
Each output file contains at most 8 lines.
# Split into one-line files named a0000, a0001, ... ("a" is the output prefix).
split -l 1 -a 4 -d file.ext a
# Put the first 90% of the file's lines into the first chunk (e.g. a
# train/test split). (Fixed: replaced the deprecated $[ ] arithmetic with
# POSIX $(( )), and used `wc -l < file` so no filename needs to be cut
# out of wc's output.)
split -l $(( $(wc -l < filename) * 90 / 100 )) filename
# Prefix every file name in the current directory with "PRE_".
for f in * ; do mv "$f" "PRE_$f" ; done
# Save the list of entries in dir/ to file.list.
ls -d dir/* > file.list
# Replace every space in *.ext file names with an underscore.
for f in *.ext; do mv "$f" "${f// /_}"; done
or
# Replace the first space in each *.ext file name with an underscore
# (note: add the /g flag to replace ALL spaces).
rename 's/ /_/' *.ext
# Reconfigure system locales (Debian/Ubuntu).
dpkg-reconfigure locales
# Print characters 4 through 10 of every line.
cut -c 4-10 file
# Recursively grep every file under the current directory.
# (Fixed: '-name "*"' was redundant, and unquoted xargs input breaks on
# file names containing spaces; NUL-delimit with -print0 / xargs -0.)
find . -type f -print0 | xargs -0 grep 'search keyword'
# List all *.ext files below the current directory.
find -name "*.ext"
# Print the text inside each <seg ...>...</seg> tag, one segment per line.
perl -ne 'print $1."\n" if /<seg[^>]+>\s*(.*\S)\s*<.seg>/i;' < input > output
# Same extraction, but emit all segments space-separated on a single line.
perl -ne '$string = $1."\n" if /<seg[^>]+>\s*(.*\S)\s*<.seg>/i;$string =~ s/^\s+//; $string =~ tr{\n}{ };print $string;' < input > output
# Same extraction, flattening embedded newlines, one segment per line.
perl -ne '$string = $1 if /<seg[^>]+>\s*(.*\S)\s*<.seg>/i;$string =~ s/^\s+//; $string =~ tr{\n}{ };print $string."\n";' < input > output
# Print only lines that occur exactly once (uniq -u drops all duplicated lines).
sort myfile.txt | uniq -u
# Merge f1.txt and f2.txt line-by-line, separated by ':'.
paste -d : f1.txt f2.txt
# Move every file listed (one per line) in the list file into the target dir.
xargs -a file_list.txt mv -t /path/to/dest
xargs -a ls.txt mv -t dir/
# Copy files whose ls date column contains 'Oct' (fragile: parses ls output).
ls -l dir/* | grep 'Oct' | awk '{print $9}' | xargs cp -t dir/
# Concatenate every file under dir/ onto the end of outfile.
# (Fixed: '-name "*"' matched directories too — cat on a directory errors —
# and '-exec ... \;' spawned one cat per file; '-type f -exec cat {} +'
# batches many files per cat invocation.)
find dir/ -type f -exec cat {} + >> outfile
# Split a large XML file into one file per element (xml_twig_tools).
xml_split -l 1 dir/*
# Count closing tags == number of <node_name> elements in the document.
grep '</node_name>' yourfile.xml -o | wc -l
# Print the 2nd line of every file in the current directory.
for i in *; do sed -n 2p "$i"; done
# Print lines 10 through 20 of a file.
sed -n '10,20p' <filename>
# Concatenate every file in dir/, writing SEPARATOR after each one.
# (Fixed: "$i" is now quoted so file names containing spaces survive
# word-splitting and globbing.)
for i in dir/*; do cat "$i" ; printf 'SEPARATOR'; done > myfile.ext
# Remove all whitespace from file names (perl-rename).
rename "s/\s+//g" *
# Delete blank (whitespace-only) lines in place — two equivalent forms.
sed -i '/^\s*$/d' file
sed -i '/^[[:space:]]*$/d' file
# Join all lines into one, replacing each newline with a space, in place.
perl -i -p -e 's/\n/ /' file
# Delete all digits.
sed 's/[0-9]*//g' file
# Replace string1 with string2 everywhere, in place.
sed -i 's/string1/string2/g' file
# List the entries of dir/ (the names themselves, not directory contents).
ls -d dir/*
# Same listing with the .ext suffix stripped (prints only matching names).
ls -d dir/* | sed -n 's/\.ext//p'
# Resample a single WAV file to 16 kHz.
sox infile.wav -r 16000 outfile.wav
# Batch version: enter the WAV directory first, then run the loop below.
cd wavdir
# Resample every .wav in the current directory to 16 kHz into ../outdir/.
# (Fixed: "$i" is quoted so file names with spaces are handled.)
for i in *.wav; do sox "$i" -r 16000 ../outdir/"$i" ; done
# Convert every .mp3 to a 16 kHz mono WAV with the same base name.
# (Fixed: "$f" and "${wav_name}.wav" are quoted against spaces in names.)
for f in *.mp3; do wav_name="${f%.*}"; ffmpeg -i "$f" -ar 16000 -ac 1 "${wav_name}.wav"; done
# Sum the durations of all WAVs in the current directory and report the
# total in minutes and hours. Float arithmetic is delegated to python
# because shell arithmetic is integer-only.
total_duration=0.0
for file in *.wav
do
# duration of this file in seconds (sox --i -D)
duration=$(sox --i -D "$file")
total_duration=$(python -c "print($total_duration+$duration)")
# sample rate / channels / name — only used by the commented-out printf below
s_rate=$(sox --i -r "$file")
channels=$(sox --i -c "$file")
filename=$(basename "$file")
#printf "duration: %s sample rate: %s channels: %d file:%s\n" "$duration" "$s_rate" "$channels" "$filename"
done
printf "total duration: in minutes: %.2f minutes \t in hours: %.2f hours\n" \
$(python -c "print($total_duration/60)") $(python -c "print($total_duration/60/60)")
# Extract the text between the first '[' and the following ']' on each line.
cat file | cut -d "[" -f2 | cut -d "]" -f1 > outfile
- determine file encoding
file -i file.ext
- convert encoding
iconv -f from-encoding -t to-encoding inputfile(s) -o outputfile
# Show each distinct line with its count, most frequent first.
sort filename | uniq -c | sort -nr
# Create a gzip-compressed tar archive of a directory.
tar -zcvf archive-name.tar.gz directory-name
Where,
- -z: Compress archive using gzip program
- -c: Create archive
- -v: Verbose i.e display progress while creating archive
- -f: Archive File name
# Extract a gzip-compressed tar archive.
tar -zxvf archive-name.tar.gz directory-name
Where,
- -x: Extract files
# Flush swap: disable, then re-enable, all swap devices.
sudo swapoff -a
sudo swapon -a
# Re-encode every file in the current directory with lame, amplified 3x,
# writing the results into ../out/.
for i in *; do lame --scale 3 "$i" ../out/"$i"; done
Build and test language models using SRILM tool
-unk Build an "open vocabulary" LM, i.e., one that contains the unknown-word token as a regular word. The default is to remove the unknown word.
-kndiscountn where n is 1, 2, 3, 4, 5, 6, 7, 8, or 9. Use Chen and Goodman's modified Kneser-Ney discounting for N-grams of order n.
# Train a trigram LM with modified Kneser-Ney discounting at orders 2 and 3,
# mapping out-of-vocabulary words to the unknown-word token.
ngram-count -order 3 -vocab corpus.vocab -text corpus.txt -lm corpus.lm -unk -kndiscount2 -kndiscount3
# Evaluate the LM's perplexity on held-out test text.
ngram -lm corpus.lm -ppl test.txt -unk
Training with SRILM is easy. Moreover, SRILM is the most advanced toolkit to date. To train the model you can use the following command:
# Train an interpolated Kneser-Ney LM from the training text.
ngram-count -kndiscount -interpolate -text train-text.txt -lm your.lm
-interpolaten option where n is 1, 2, 3, 4, 5, 6, 7, 8, or 9. Causes the discounted N-gram probability estimates at the specified order n to be interpolated with lower-order estimates.
You can prune the model afterwards to reduce the size of the model
# Prune low-probability n-grams to shrink the model file.
ngram -lm your.lm -prune 1e-8 -write-lm your-pruned.lm
After training, it is worth testing the perplexity of the model on the test data:
# Report perplexity on the test set.
ngram -lm your.lm -ppl test-text.txt
-
Prepare a reference text that will be used to generate the language model. The language model toolkit expects its input to be in the form of normalized text files, with utterances delimited by
<s>
and </s>
tags. -
Generate the vocabulary file. This is a list of all the words in the file:
# Count word frequencies, then build the vocabulary file (CMU-Cambridge SLM).
text2wfreq < speech.txt | wfreq2vocab > corpus.vocab
-
You may want to edit the vocabulary file to remove words (numbers, misspellings, names). If you find misspellings, it is a good idea to fix them in the input transcript.
-
If you want a closed vocabulary language model (a language model that has no provisions for unknown words), then you should remove sentences from your input transcript that contain words that are not in your vocabulary file.
-
Generate the arpa format language model with the commands:
# Convert the text to id n-grams, then the id n-grams to an ARPA-format LM.
text2idngram -vocab corpus.vocab -idngram corpus.idngram < corpus.closed.txt
idngram2lm -vocab_type 0 -idngram corpus.idngram -vocab corpus.vocab -arpa corpus.lm
- Generate the CMU binary form (BIN)
# Convert the ARPA LM to the CMU Sphinx binary format.
sphinx_lm_convert -i corpus.lm -o corpus.lm.bin
Your contributions to improve the code are welcomed. Please follow the steps below.
- Fork the project.
- Modify the code, test it, make sure that it works fine.
- Make a pull request.
Please consult GitHub's help pages if you need assistance.