Last active
October 16, 2022 17:35
-
-
Save antonrasmussen/edf19bc6582b2576adf3b8acd18c7f38 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env bash
#
# Collect tweets for a fixed list of search terms and build a sorted,
# de-duplicated list of the external links found in them.
#
# For every search term the script:
#   1. runs collect-tweets.py with the term as its only argument,
#   2. runs process-tweets.py with tweets.jsonl on stdin and stores its
#      stdout in tweets-info.txt,
#   3. appends every qualifying link from tweets-info.txt to
#      link_list_unsorted.txt.
# Afterwards link_list.txt is regenerated from link_list_unsorted.txt.
#
# NOTE(review): collect-tweets.py is presumably what (re)writes tweets.jsonl;
# that script is not visible here -- confirm.
#
# Requires bash (arrays are used), hence the bash shebang rather than /bin/sh.

set -u

readonly home_dir="./fall22-hw2-antonrasmussen/"
readonly python_bin="/opt/homebrew/bin/python3"
readonly link_pref="https:/"
readonly -a search_term_array=("virginia" "beach" "coronavirus" "dall-e" "openai")

# Everything below uses paths relative to the project directory, so refuse to
# continue from the wrong place.
cd "$home_dir" || {
  printf 'error: cannot cd to %s\n' "$home_dir" >&2
  exit 1
}

# The helper scripts were originally addressed as "$home_dir/<script>" *after*
# changing into $home_dir, which only resolves when a nested directory of the
# same name exists. Keep honouring that layout when it is present; otherwise
# use the scripts that live in the current directory.
script_dir="$home_dir"
if [[ ! -f "${script_dir}/collect-tweets.py" ]]; then
  script_dir="."
fi

# Make sure the accumulator exists even if no term yields a link, so the final
# sort never fails on a missing file.
touch link_list_unsorted.txt

for search_term in "${search_term_array[@]}"; do
  if ! "$python_bin" "${script_dir}/collect-tweets.py" "$search_term"; then
    printf 'warning: collect-tweets.py failed for "%s"; skipping\n' \
      "$search_term" >&2
    continue
  fi

  if ! "$python_bin" "${script_dir}/process-tweets.py" \
      < tweets.jsonl > tweets-info.txt; then
    printf 'warning: process-tweets.py failed for "%s"; skipping\n' \
      "$search_term" >&2
    continue
  fi

  # Keep only lines mentioning https, then drop twitter, .ly, youtube and
  # instagram links. -F makes the patterns literal, so ".ly" no longer
  # matches arbitrary text such as "only".
  # cut removes everything up to and including the "//" of the first URL on
  # the line; awk then stitches the prefix back onto every whitespace-separated
  # token. No shell word-splitting is involved, so URLs containing ?, * or [
  # are never subject to filename expansion.
  grep https tweets-info.txt \
    | grep -vF -e twitter -e .ly -e youtu -e instagram \
    | cut -d / -f 3- \
    | awk -v prefix="${link_pref}/" '{ for (i = 1; i <= NF; i++) print prefix $i }' \
    >> link_list_unsorted.txt
done

# The unsorted list keeps growing across runs; link_list.txt is always rebuilt
# as its sorted, unique contents.
sort -u link_list_unsorted.txt > link_list.txt
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment