Skip to content

Instantly share code, notes, and snippets.

@MartinNowak
Last active September 23, 2016 15:38
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save MartinNowak/d9f1176a7a15f3ffe0239e3f48f4f5d2 to your computer and use it in GitHub Desktop.
Save MartinNowak/d9f1176a7a15f3ffe0239e3f48f4f5d2 to your computer and use it in GitHub Desktop.
get corpora for different languages
#!/bin/bash
#
# Collects corpora of '*.d' files from the most-starred GitHub repositories
# of several languages into ./corpora/<language>/<owner>/<repo>/.
#
# Environment: GH_TOKEN - GitHub token sent with every wget request.
# External tools used by the script: wget, jq, tar, find, file, iconv, sed.

# Strict mode: stop on the first failing command, on any use of an unset
# variable, and when any stage of a pipeline fails.
set -o errexit -o nounset -o pipefail

# Scratch directory into which each repository archive gets unpacked.
tmp=$(mktemp -d)
#######################################
# Delete the scratch directory and everything left inside it.
# Globals:   tmp (read) - path of the scratch directory
# Arguments: none
#######################################
cleanup() {
  rm -r -f "$tmp"
}
# Run cleanup whenever the shell exits so the scratch directory is removed.
trap cleanup EXIT
#######################################
# Shadow wget so that every request carries the GitHub authorization header.
# Globals:   GH_TOKEN (read) - GitHub access token; must be set and non-empty
# Arguments: passed through unchanged to the real wget binary
# Returns:   the exit status of the real wget. If GH_TOKEN is unset or empty
#            the shell aborts with an explanatory message instead of sending
#            a request with an empty credential.
#######################################
wget() {
  # 'command' bypasses this function and runs the wget found on PATH.
  command wget \
    --header="Authorization: token ${GH_TOKEN:?GH_TOKEN must be set to a GitHub access token}" \
    "$@"
}
# Build one todo list per language: corpora/<lang>.repos holds the full names
# (owner/repo) of the 100 most-starred repositories GitHub reports for that
# language. An existing list is kept as is.
mkdir -p corpora
for lang in D DTrace Makefile; do
  list="corpora/$lang.repos"
  if [[ ! -f "$list" ]]; then
    echo "Getting top $lang repos"
    # Download into a scratch name and rename only on success. Redirecting
    # straight into the list would create the file even when the request or
    # jq fails, and the -f check above would then treat the empty or partial
    # list as complete on every later run.
    if ! wget -qO- "https://api.github.com/search/repositories?q=language:$lang&sort=stars&per_page=100" \
      | jq -r '.items | .[] | .full_name' > "$list.part"; then
      rm -f -- "$list.part"
      echo "Failed to fetch the list of $lang repos" >&2
      exit 1
    fi
    mv -- "$list.part" "$list"
    mkdir -p "corpora/$lang"
  fi
done
#######################################
# Delete a file that `file` classifies as binary or of unknown encoding;
# otherwise re-encode it to UTF-8 in place.
# Arguments: $1 - path of the file to process
# Outputs:   a warning on stderr when the file cannot be converted
# Returns:   0 unless `file` or the final rename fails
#######################################
normalize_encoding() {
  local path=$1
  local encoding
  encoding=$(file --brief --mime-encoding "$path")
  if [[ "$encoding" == "binary" || "$encoding" == "unknown"* ]]; then
    rm -f -- "$path"
  elif iconv --from="$encoding" --to=UTF-8 --output="$path.tmp" "$path"; then
    mv -- "$path.tmp" "$path"
  else
    # A single undecodable file must not abort the run: the repo would fail
    # at the same file again on every resume. Drop the file instead.
    echo "Dropping $path: cannot convert from $encoding to UTF-8" >&2
    rm -f -- "$path" "$path.tmp"
  fi
}

#######################################
# Remove every line that is exactly equal to an entry from a list file.
# The entry is matched as a whole-line fixed string, so removing
# "owner/repo" leaves "owner/repo-extra" in place and "." is not a wildcard.
# Arguments: $1 - list file, $2 - entry to remove
# Returns:   0 on success, non-zero if the list cannot be read or replaced
#######################################
remove_from_list() {
  local list=$1
  local entry=$2
  # grep exits 1 when it prints no line, i.e. when the list becomes empty;
  # only other non-zero statuses are real errors.
  grep -vxF -- "$entry" "$list" > "$list.tmp" || [[ $? -eq 1 ]] || return
  mv -- "$list.tmp" "$list"
}

# For every repository still on a language's todo list: download the default
# branch, keep its non-empty '*.d' files under corpora/<lang>/<owner>/<repo>/,
# sanitise their encoding, then take the repository off the list so that an
# interrupted run resumes where it stopped.
for lang in D DTrace Makefile; do
  list="corpora/$lang.repos"
  # The list is read through fd 3 so commands in the body keep their stdin.
  while IFS= read -r repo <&3; do
    [[ -n "$repo" ]] || continue
    echo "Fetching $lang $repo"
    branch=$(wget -qO- "https://api.github.com/repos/$repo" | jq -r '.default_branch')
    wget -qO- --show-progress "https://github.com/$repo/archive/$branch.tar.gz" \
      | tar -C "$tmp" --strip-components=1 -zxf -
    dest="corpora/$lang/$repo"
    mkdir -p "$dest"
    # Files are flattened into one directory, so files that share a basename
    # overwrite each other.
    find "$tmp" -name '*.d' -type f -not -empty -exec mv {} "$dest/" \;
    # purge tmp
    find "$tmp" -mindepth 1 -delete
    # Delete binary data and convert the rest to UTF-8. Paths are passed
    # NUL-delimited so names containing whitespace stay intact; sort -z
    # consumes the whole listing before the loop starts modifying files.
    while IFS= read -r -d '' path; do
      normalize_encoding "$path"
    done < <(find "$dest" -name '*.d' -type f -print0 | LC_ALL=C sort -z)
    # delete repo from todo list
    remove_from_list "$list" "$repo"
  done 3< "$list"
done
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment