Created
March 6, 2020 18:39
-
-
Save llamasoft/f2e12cd0cbe65fa9691f52f043f20d3c to your computer and use it in GitHub Desktop.
Fetches and parses the Destiny 2 assets library into a wordlist.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Resolve the server-side path of the English aggregated JSON asset database.
#
# With an API key (set here or through the BUNGIE_API_KEY environment
# variable) the path is read from the live Destiny 2 manifest. Without a key,
# or when the manifest cannot be fetched or parsed, a pinned recent aggregate
# is used as a fall-back.
#
# Sets: aggregate_path
api_key="${BUNGIE_API_KEY:-}"
fallback_aggregate_path="/common/destiny2_content/json/en/aggregate-2fbe1829-dfcd-44ec-84d3-bb04a3777dc1.json"
aggregate_path=""

if [[ -n "${api_key}" ]]; then
  # --fail makes curl exit non-zero on an HTTP error instead of saving the
  # error body as manifest.json.
  if curl --fail --silent --show-error --compressed \
    "https://www.bungie.net/Platform/Destiny2/Manifest/" \
    -H "x-api-key: ${api_key}" \
    -o "manifest.json"; then
    # Extract the name of the aggregated JSON asset database. "// empty"
    # yields no output for a missing or null field rather than the literal
    # string "null".
    aggregate_path=$(
      jq --raw-output '.Response.jsonWorldContentPaths.en // empty' "manifest.json"
    ) || aggregate_path=""
  fi
  if [[ -z "${aggregate_path}" ]]; then
    printf 'warning: could not read the aggregate path from the manifest; using the pinned fall-back\n' >&2
  fi
fi

if [[ -z "${aggregate_path}" ]]; then
  # Use a recent aggregated JSON file as a fall-back.
  aggregate_path="${fallback_aggregate_path}"
fi
# Download the aggregate JSON file named by aggregate_path into the current
# directory, unless a file of that name is already there.
#
# Reads: aggregate_path
# Sets:  aggregate_name
aggregate_name=$(basename "${aggregate_path}")
if [[ ! -f "${aggregate_name}" ]]; then
  # The aggregate JSON file is rather large, so only download it if required.
  # The transfer goes to a ".part" name and is renamed on success, so a failed
  # or partial download never leaves a file that the existence check above
  # would accept as a complete copy on the next run.
  # A leading "/" (the fall-back path has one) is stripped so the URL does not
  # contain "//".
  if curl --fail -L --compressed \
    -o "${aggregate_name}.part" \
    "https://www.bungie.net/${aggregate_path#/}"; then
    mv -- "${aggregate_name}.part" "${aggregate_name}"
  else
    rm -f -- "${aggregate_name}.part"
    printf 'error: failed to download %s\n' "${aggregate_path}" >&2
    exit 1
  fi
fi
# Recursively extract all string values from keys named "description",
# "subtitle", or "name" and write them to destiny.txt.
#
# Only strings are kept: a null, numeric, or structured value under one of
# those keys would otherwise be printed as JSON text and end up in the word
# list. A jq failure (for example a corrupt aggregate file) stops the script
# instead of leaving a truncated destiny.txt for the later steps.
#
# Reads: aggregate_name
if ! jq --raw-output '
  ..
  | objects
  | to_entries[]
  | select(.key == "description" or .key == "subtitle" or .key == "name")
  | .value
  | strings
' "${aggregate_name}" > "destiny.txt"; then
  printf 'error: jq could not extract text from %s\n' "${aggregate_name}" >&2
  exit 1
fi
# There are a lot of fully duplicated lines because of descriptions.
# Before taking a frequency count, make sure each line is unique and not blank.
# [[:space:]] is used rather than a literal space so that lines holding only
# tabs or a stray carriage return are discarded as well.
sort -u "destiny.txt" | grep -v '^[[:space:]]*$' > "destiny.uniq"
# Ok, this is a big one.
# The input file contains a lot of Unicode punctuation that needs to be dumbed
# down to ASCII if possible, then split into words and counted.
# Reads destiny.uniq and writes "count word" lines, highest count first, to
# destiny.freq.

# Ask uconv's transliterator to rewrite punctuation and symbols as ASCII.
# Reads stdin, writes stdout.
transliterate_punctuation() {
  uconv --fallback -f UTF8 -t UTF8 \
    -x ':: Any-Publishing; :: [[:Punctuation:][:Symbol:]] Latin-ASCII;' \
    --callback skip
}

# Split stdin into words and print a frequency table to stdout.
#  - Every byte that is not alpha-numeric, a dash, or a single-quote becomes a
#    line break, so anything that could not be transliterated is zapped here.
#    LC_ALL=C keeps the tr ranges byte-wise regardless of the caller's locale.
#  - Lines that are empty, entirely numeric, or begin/end with punctuation are
#    removed.
#  - If a word appears in multiple cases, only the all-lowercase version is
#    kept if present. Proper nouns will always appear capitalized, but regular
#    words may be capitalized if they start a sentence; this prevents
#    duplicates of regular words that just happen to start a sentence.
word_frequencies() {
  LC_ALL=C tr -c "A-Za-z0-9-'" '\n' \
    | grep -v -e '^[0-9]*$' -e "^[-']" -e "[-']$" \
    | awk '
        # Compare against the empty string explicitly: a bare $0 pattern is
        # evaluated numerically for number-like tokens, which would silently
        # drop a word such as "0e5".
        $0 != "" { freq[$0]++ }
        END {
          for (word in freq) {
            if (tolower(word) == word || !(tolower(word) in freq)) {
              printf("%5d %s\n", freq[word], word)
            }
          }
        }' \
    | sort -rn
}

transliterate_punctuation < "destiny.uniq" | word_frequencies > "destiny.freq"
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment