Created
August 9, 2017 12:46
-
-
Save mkolly/8cc5d7b59786a701172c34dc17baf394 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash | |
#Marie-José Kolly via NZZ Storytelling, July 2017: How we scraped the publication database of the Basel Committee on Banking Supervision and analyzed their supervisory texts in order to visualize quantity and quality of regulatory text over time | |
#Feedback welcome by e-mail marie-jose.kolly[at]nzz.ch or twitter [at]mjkolly | |
#Article presenting results: www.nzz.ch/ld.1304103 | |
#risk per file, corpus 3 | |
cd mypath/data/corpus3 | |
for file in $(ls -1 |grep -i .txt ) ; do printf "$file\t" && grep -o 'risk' $file | wc -l; done > /mypath/data/unixResults/riskPerFile_unix.txt | |
#words per file, corpus 2 | |
cd mypath/data/corpus2 | |
for file in $(ls -1 |grep -i .txt ) ; do printf "$file\t" && wc -w $file; done > /mypath/data/unixResults/nWordsPerFile_unix.txt | |
#word frequency, corpus 3 | |
cd mypath/data/corpus3 | |
cat *.txt > mypath/data/unixResults/allTextC3_unix.txt | |
cd unixResults | |
cat allTextC3_unix.txt|tr ' ' '\n'| sort | uniq -c | sort -rn > mypath/data/unixResults/wordFreqencyC3_unix.txt | |
#word frequency, corpus 2 | |
cd mypath/data/corpus2 | |
cat *.txt > mypath/data/unixResults/allTextC2_unix.txt | |
cd unixResults | |
cat allTextC2_unix.txt|tr ' ' '\n'| sort | uniq -c | sort -rn > mypath/data/unixResults/wordFreqencyC2_unix.txt | |
#collocations | |
cd mypath/data/unixResults | |
cat allTextC2_unix.txt | sed 's/,//' | sed G | tr ' ' '\n' > tmp.txt | |
tail -n+2 tmp.txt > tmp2.txt | |
paste -d ',' tmp.txt tmp2.txt | grep -v -e "^," | grep -v -e ",$" | sort | uniq -c | sort -rn > mypath/data/unixResults/bigramsFrequencyC2_unix.txt | |
#trigrams | |
cd mypath/data/unixResults | |
cat allTextC2_unix.txt | sed 's/,//' | sed G | tr ' ' '\n' > tmp.txt | |
tail -n+2 tmp.txt > tmp2.txt | |
tail -n+2 tmp2.txt > tmp3.txt | |
paste -d ',' tmp.txt tmp2.txt tmp3.txt | grep -v -e "^," | grep -v -e ",$" | grep -v -e ",," | sort | uniq -c | sort -rn > mypath/data/unixResults/trigramsFrequencyC2_unix.txt | |
#most important risk types per file, corpus 2 | |
cd mypath/data/corpus2 | |
for file in $(ls -1 |grep -i .txt ) ; do printf "$file\t" \ | |
&& grep -o '\<credit risk' $file | wc -l | tr -d '\n' && printf "\t" \ | |
&& grep -o '\<market risk' $file | wc -l | tr -d '\n' && printf "\t" \ | |
&& grep -o '\<operational risk' $file | wc -l | tr -d '\n' && printf "\t" \ | |
&& grep -o '\<interest rate risk' $file | wc -l | tr -d '\n' && printf "\t" \ | |
&& grep -o '\<liquidity risk' $file | wc -l; done > mypath/data/unixResults/riskTypesPerFileWithUnix_unix.txt | |
#modal verbs and constructions per file, corpus 2 | |
cd mypath/data/corpus2 | |
for file in $(ls -1 |grep -i .txt ) ; do printf "$file\t" \ | |
&& grep -o '\<should\>' $file | wc -l | tr -d '\n' && printf "\t" \ | |
&& grep -o '\<should not\>' $file | wc -l | tr -d '\n' && printf "\t" \ | |
&& grep -o '\<must\>' $file | wc -l | tr -d '\n' && printf "\t" \ | |
&& grep -o '\<must not\>' $file | wc -l | tr -d '\n' && printf "\t" \ | |
&& grep -o '\<has to\>' $file | wc -l | tr -d '\n' && printf "\t" \ | |
&& grep -o '\<have to\>' $file | wc -l | tr -d '\n' && printf "\t" \ | |
&& grep -o '\<does not have to\>' $file | wc -l | tr -d '\n' && printf "\t" \ | |
&& grep -o '\<do not have to\>' $file | wc -l | tr -d '\n' && printf "\t" \ | |
&& grep -o '\<shall\>' $file | wc -l | tr -d '\n' && printf "\t" \ | |
&& grep -o '\<shall not\>' $file | wc -l | tr -d '\n' && printf "\t" \ | |
&& grep -o '\<ought to\>' $file | wc -l | tr -d '\n' && printf "\t" \ | |
&& grep -o '\<ought not\>' $file | wc -l | tr -d '\n' && printf "\t" \ | |
&& grep -o '\<need to\>' $file | wc -l | tr -d '\n' && printf "\t" \ | |
&& grep -o '\<needs to\>' $file | wc -l | tr -d '\n' && printf "\t" \ | |
&& grep -o '\<does not need to\>' $file | wc -l | tr -d '\n' && printf "\t" \ | |
&& grep -o '\<do not need to\>' $file | wc -l | tr -d '\n' && printf "\t" \ | |
&& grep -o '\<need not\>' $file | wc -l | tr -d '\n' && printf "\t" \ | |
&& grep -o '\<is supposed to\>' $file | wc -l | tr -d '\n' && printf "\t" \ | |
&& grep -o '\<are supposed to\>' $file | wc -l | tr -d '\n' && printf "\t" \ | |
&& grep -o '\<is not supposed to\>' $file | wc -l | tr -d '\n' && printf "\t" \ | |
&& grep -o '\<are not supposed to\>' $file | wc -l | tr -d '\n' && printf "\t" \ | |
&& grep -o '\<is required to\>' $file | wc -l | tr -d '\n' && printf "\t" \ | |
&& grep -o '\<are required to\>' $file | wc -l | tr -d '\n' && printf "\t" \ | |
&& grep -o '\<is not required to\>' $file | wc -l | tr -d '\n' && printf "\t" \ | |
&& grep -o '\<are not required to\>' $file | wc -l | tr -d '\n' && printf "\t" \ | |
&& grep -o '\<is obliged to\>' $file | wc -l | tr -d '\n' && printf "\t" \ | |
&& grep -o '\<are obliged to\>' $file | wc -l | tr -d '\n' && printf "\t" \ | |
&& grep -o '\<is not obliged to\>' $file | wc -l | tr -d '\n' && printf "\t" \ | |
&& grep -o '\<are not obliged to\>' $file | wc -l; done > mypath/data/unixResults/modVerbsPerFileWithUnix_unix.txt | |
#regulatory instruments per file, corpus 3 | |
for file in $(ls -1 |grep -i .txt ) ; do printf "$file\t" \ | |
&& grep -o '\<capital requirement' $file | wc -l | tr -d '\n' && printf "\t" \ | |
&& grep -o '\<risk-weighted asset' $file | wc -l | tr -d '\n' && printf "\t" \ | |
&& grep -o '\<risk weighted asset' $file | wc -l | tr -d '\n' && printf "\t" \ | |
&& grep -o '\<supervisory review' $file | wc -l | tr -d '\n' && printf "\t" \ | |
&& grep -o '\<stress test' $file | wc -l | tr -d '\n' && printf "\t" \ | |
&& grep -o '\<internal capital adequacy assessment process' $file | wc -l | tr -d '\n' && printf "\t" \ | |
&& grep -o '\<icaap\>' $file | wc -l | tr -d '\n' && printf "\t" \ | |
&& grep -o '\<market discipline' $file | wc -l | tr -d '\n' && printf "\t" \ | |
&& grep -o '\<disclosure requirement' $file | wc -l | tr -d '\n' && printf "\t" \ | |
&& grep -o '\<disclosure standard' $file | wc -l | tr -d '\n' && printf "\t" \ | |
&& grep -o '\<capital conservation buffer' $file | wc -l | tr -d '\n' && printf "\t" \ | |
&& grep -o '\<leverage ratio' $file | wc -l | tr -d '\n' && printf "\t" \ | |
&& grep -o '\<lr\>' $file | wc -l | tr -d '\n' && printf "\t" \ | |
&& grep -o '\<total loss absorbing capacity' $file | wc -l | tr -d '\n' && printf "\t" \ | |
&& grep -o '\<total loss absorbing capaciti' $file | wc -l | tr -d '\n' && printf "\t" \ | |
&& grep -o '\<tlac\>' $file | wc -l | tr -d '\n' && printf "\t" \ | |
&& grep -o '\<countercyclical capital buffer' $file | wc -l | tr -d '\n' && printf "\t" \ | |
&& grep -o '\<countercyclical buffer' $file | wc -l | tr -d '\n' && printf "\t" \ | |
&& grep -o '\<liquidity coverage ratio' $file | wc -l | tr -d '\n' && printf "\t" \ | |
&& grep -o '\<lcr\>' $file | wc -l | tr -d '\n' && printf "\t" \ | |
&& grep -o '\<high-quality liquid asset' $file | wc -l | tr -d '\n' && printf "\t" \ | |
&& grep -o '\<hqla\>' $file | wc -l | tr -d '\n' && printf "\t" \ | |
&& grep -o '\<net stable funding ratio' $file | wc -l | tr -d '\n' && printf "\t" \ | |
&& grep -o '\<nsfr\>' $file | wc -l | tr -d '\n' && printf "\t" \ | |
&& grep -o '\<available stable funding' $file | wc -l | tr -d '\n' && printf "\t" \ | |
&& grep -o '\<required stable funding' $file | wc -l; done > mypath/data/unixResults/instrumentsWithUnix_unix.txt | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment