@dchaplinsky
Created April 6, 2023 14:30
Small bash script that downloads 1.6 TB of extracted structured data from the Common Crawl (the Web Data Commons December 2022 release) and finds pages where HowTo/FAQ structured data is available.
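
The dumps are gzipped N-Quads, one statement per line, with the URL of the page the triple was extracted from as the fourth term. A line the grep below matches should look roughly like this (the example.com URLs here are made up):

<https://example.com/faq#main> <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://schema.org/FAQPage> <https://example.com/faq> .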
#!/bin/bash
# You will need GNU parallel and pv (`apt-get install parallel pv`) to make it run
# download the list of dump file URLs
curl http://webdatacommons.org/structureddata/2022-12/files/file.list > urls.txt
# create output file
touch output.txt
# stream each dump through zcat and grep for FAQPage/HowTo triples, downloading
# 4 files at a time; pv meters the URL list going in and the matches coming out
cat urls.txt | pv -cN Input | parallel -j 4 "curl -s {} | zcat | grep -e '<http://schema.org/FAQPage>' -e '<http://schema.org/HowTo>'" | pv -cN Output > output.txt
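
A possible follow-up step, sketched under the assumption that output.txt holds well-formed N-Quads with no whitespace inside terms: pull the page URL (the second-to-last field, just before the closing dot) and deduplicate.

# sketch: strip the angle brackets from the graph term and keep unique page URLs
awk '{ gsub(/[<>]/, "", $(NF-1)); print $(NF-1) }' output.txt | sort -u > pages.txt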