rom1504/a_download_cah_from_theeye.md

## a_download_cah_from_theeye.md

      
    Raw
  

              a_download_cah_from_theeye.md
            
          
    This explains how to download http://the-eye.eu/eleuther_staging/cah/, a large dataset of
image/text pairs filtered from Common Crawl.

Run get_links.sh; this produces a for_aria.txt file (written by to_aria.py) that lists every URL to download and the directory each file goes in.
Run download_cah.sh; it uses aria2c to download the files quickly (takes about 1h).

Note: if you only want certain file types, edit the filter `grep 'csv\|txt\|pkl\|tfrecord'` in get_links.sh.

  
## download_cah.sh
# Download every URL listed in for_aria.txt, the input file written by
# to_aria.py. Each entry in that file carries its own dir= option; --dir=output
# is the fallback download directory.
#   --auto-file-renaming=false  do not save a second copy under a new name
#   --continue=true             resume partially downloaded files
#   -x 16 / -s 16 / -j 16       connections per server / splits per file /
#                               parallel downloads
aria2c --dir=output --auto-file-renaming=false --continue=true -i for_aria.txt -x 16 -s 16 -j 16

## get_links.sh
# Crawl the CAH directory listing two levels deep, collect the URLs of the
# data files, then hand them to to_aria.py.
#
# Files written in the current directory:
#   links.txt          first-level directory URLs
#   links2.txt         second-level directory URLs, plus the first entry of
#                      links.txt
#   download_urls.txt  URLs of the csv/txt/pkl/tfrecord files found
#
# Deliberately no `set -e` / `pipefail`: rg and grep exit non-zero when a page
# has no matching link, and that must not abort the crawl.
BASE_URL='http://the-eye.eu/eleuther_staging/cah/'

# Print the lynx dump (including its link list) of the page at $1.
# stdin is detached so lynx can never swallow the input of a surrounding
# `while read` loop.
list_links() {
  lynx -dump -hiddenlinks=listonly -nonumbers "$1" < /dev/null
}

list_links "$BASE_URL" | rg '/cah/.*/$' > links.txt

# Writing the whole group with a single `>` truncates links2.txt on every run,
# so nothing is left over from a previous crawl.
{
  # skip first line as the "presorted" url only has 1 level
  tail -n +2 links.txt | while read -r url; do
    [ -n "$url" ] || continue
    list_links "$url" | rg -- "${url}.*/\$"
  done
  # add the "presorted" url to the next level of crawling
  head -n 1 links.txt
} > links2.txt

# Keep only the data files; change the grep pattern to restrict file types.
while read -r url; do
  [ -n "$url" ] || continue
  list_links "$url" | rg cah | grep 'csv\|txt\|pkl\|tfrecord'
done < links2.txt > download_urls.txt

python3 to_aria.py

## to_aria.py
"""Convert download_urls.txt into for_aria.txt, an aria2c input file.

Each URL becomes one entry: the URL on its own line, followed by indented
per-download options that place the file under ``output/`` in a directory
derived from the URL path.
"""
import os

URL_LIST = "download_urls.txt"
ARIA_INPUT = "for_aria.txt"
OUTPUT_ROOT = "output"


def format_aria_entry(url, root=OUTPUT_ROOT):
    """Return the aria2c input-file entry for ``url``.

    The target directory is ``root`` plus the URL's path components from
    index 4 of ``url.split("/")`` onwards, minus the final component (the
    file name). For ``http://host/a/b/c/f.csv`` that is ``root/b/c``.
    """
    output_dir = root + "/" + "/".join(url.split("/")[4:-1])
    return (
        url + "\n"
        + " dir=" + output_dir + "\n"
        + " continue=true\n"
        + " max-connection-per-server=16\n"
        + " split=16\n"
        + " min-split-size=20M\n\n"
    )


def main():
    """Read URL_LIST and write one aria2c entry per non-blank line."""
    os.makedirs(OUTPUT_ROOT, exist_ok=True)

    # Strip each line: in an aria2c input file a line that starts with
    # whitespace is an option line, so a padded URL would be misparsed.
    with open(URL_LIST, "r") as src:
        urls = [line.strip() for line in src]

    with open(ARIA_INPUT, "w") as f:
        for url in urls:
            if url:
                f.write(format_aria_entry(url))


if __name__ == "__main__":
    main()
	# Crawl the CAH directory listing two levels deep, collect the URLs of the
	# data files, then hand them to to_aria.py.
	#
	# Files written in the current directory:
	#   links.txt          first-level directory URLs
	#   links2.txt         second-level directory URLs, plus the first entry of
	#                      links.txt
	#   download_urls.txt  URLs of the csv/txt/pkl/tfrecord files found
	#
	# Deliberately no `set -e` / `pipefail`: rg and grep exit non-zero when a page
	# has no matching link, and that must not abort the crawl.
	BASE_URL='http://the-eye.eu/eleuther_staging/cah/'

	# Print the lynx dump (including its link list) of the page at $1.
	# stdin is detached so lynx can never swallow the input of a surrounding
	# `while read` loop.
	list_links() {
	  lynx -dump -hiddenlinks=listonly -nonumbers "$1" < /dev/null
	}

	list_links "$BASE_URL" | rg '/cah/.*/$' > links.txt

	# Writing the whole group with a single `>` truncates links2.txt on every run,
	# so nothing is left over from a previous crawl.
	{
	  # skip first line as the "presorted" url only has 1 level
	  tail -n +2 links.txt | while read -r url; do
	    [ -n "$url" ] || continue
	    list_links "$url" | rg -- "${url}.*/\$"
	  done
	  # add the "presorted" url to the next level of crawling
	  head -n 1 links.txt
	} > links2.txt

	# Keep only the data files; change the grep pattern to restrict file types.
	while read -r url; do
	  [ -n "$url" ] || continue
	  list_links "$url" | rg cah | grep 'csv\|txt\|pkl\|tfrecord'
	done < links2.txt > download_urls.txt

	python3 to_aria.py
	# Convert download_urls.txt into for_aria.txt, an aria2c input file: each
	# URL on its own line, followed by indented per-download options.
	import os

	# Make sure the download root exists.
	os.makedirs("output", exist_ok=True)

	# One URL per line; the context manager closes the file once it is read.
	with open("download_urls.txt", "r") as src:
	    filelist = src.read().split("\n")

	with open("for_aria.txt", "w") as f:
	    for fil in filelist:
	        if fil == "":
	            continue
	        # Components from index 4 of the "/"-split URL onwards, minus the
	        # file name, become the directory below output/.
	        output_dir = "output" + "/" + "/".join(fil.split("/")[4:][:-1])

	        f.write(fil+"\n")
	        f.write(" dir="+output_dir+"\n")
	        f.write(" continue=true\n")
	        f.write(" max-connection-per-server=16\n")
	        f.write(" split=16\n")
	        f.write(" min-split-size=20M\n\n")