goofy internal index generation script thing
# this script scrapes w2krepo.somnolescent.net and downloads the indexes of every file into a json listing
#
# third-party packages: beautifulsoup4, requests
from bs4 import BeautifulSoup
import requests
import json
import sys
import re
import time
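
# rough flow: the prompt below can skip straight to regenerating the output
# files from an existing index.json; otherwise the script scrapes
# categories.html for section links, discovers nested directories, parses each
# Apache-style <pre> listing into index.json, then renders sitemap.html and
# gopherstyle.txt from it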
def sitemap():
    # load index.json into index
    with open("index.json", "r") as f:
        index = json.load(f)
    # write an html sitemap out of the data
    with open("sitemap.html", "w") as f:
        f.write('<html><head><title>w2krepo : Index</title></head><body text="#FFFFFF" link="#FFFFFF" vlink="#FFFFFF" bgcolor="#3A6EA5" alink="#C0C0C0">')
        # build a unique, sorted list of sections
        sections = []
        for i in index:
            if index[i]["section"] not in sections:
                sections.append(index[i]["section"])
        sections.sort()
        print(sections)
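        # navigation header; the target="_top" "Exit frames" link suggests
        # this page is normally viewed inside a frameset on the site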
f.write(""" | |
<font face="Trebuchet MS" size="1"> | |
<a href="/" target="_blank"><b>w2krepo</b></a> | |
<a href="./" target="_top">Exit frames</a> | |
<a href="/Submissions">Contributions</a> | |
</font> | |
<font face="Trebuchet MS"> | |
""") | |
# write table of contents | |
f.write("<ul>") | |
for i in sections: | |
i = i.replace('http://w2krepo.somnolescent.net/', '') | |
f.write("<li><a href='#" + i + "'>" + i.replace('%20', ' ') + "</a></li>") | |
f.write("</ul>") | |
f.write("<p><a href=""https://gist.github.com/dotcomboom/580521247ed24430fb7c75896d7938de"">Generated</a> at {0}</p>".format(time.strftime("%Y-%m-%d %H:%M:%S"))) | |
# make a table | |
for section in sections: | |
f.write("<h2>" + section.replace('http://w2krepo.somnolescent.net/', '').replace('%20', ' ') + "<a name=" + section.replace('http://w2krepo.somnolescent.net/', '') + "></a></h2>") | |
f.write("<table>") | |
f.write("<tr><th>File</th><th>Date</th><th>Time</th><th>Size</th><th>Description</th></tr>") | |
for file in sorted(index.keys()): | |
if index[file]["section"] == section: | |
f.write("<tr>") | |
f.write("<td><a href=\"" + file.replace('http://w2krepo.somnolescent.net/', '/') + "\">" + file.replace(section + '/', '').replace(section, '') + "</a></td>") | |
f.write("<td>" + index[file]["date"] + "</td>") | |
f.write("<td>" + index[file]["time"] + "</td>") | |
f.write("<td>" + index[file]["size"] + "</td>") | |
f.write("<td>" + index[file]["desc"] + "</td>") | |
f.write("</tr>") | |
f.write("</table>") | |
# f.write("<ul>") | |
# for key in index: | |
# #try: | |
# f.write("<li><a href='" + key + "'>" + index[key]['section'] + ": " + key + " (" + index[key]['desc'] + ")" + "</a></li>") | |
# ##except KeyError: | |
# # print('keyerr: ' + key) | |
# f.write("</ul>") | |
f.write("</font></body></html>") | |
gopher() | |
def gopher():
    with open("index.json", "r") as f:
        index = json.load(f)
    # writes a txt file with gopher markup
    with open("gopherstyle.txt", "w") as f:
        # each section gets a little heading and then a list of files
        # build a unique, sorted list of sections
        sections = []
        for i in index:
            if index[i]["section"] not in sections:
                sections.append(index[i]["section"])
        sections.sort()
        print(sections)
        for section in sections:
            f.write("1<< Back to gopher.somnolescent.net\t/\n")
            f.write("1< Back to w2krepo\t/w2krepo\n")
            f.write('w2krepo on Gopher\n')
            friendlysection = section.replace('http://w2krepo.somnolescent.net/', '').replace('%20', ' ')
            f.write(friendlysection + '\n')
            f.write('--------------------------------------------------------------\n')
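            # gophermap-style menu lines: item type character, display string,
            # a tab, then the selector; host/port fields are left for the
            # gopher server to fill in
            # (1 = menu, 5 = archive, 9 = binary, h = web link)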
            # files too large for gopher, listed at the end as HTTP links
            http = {}
            # write files in that section
            for file in sorted(index.keys()):
                if index[file]["section"] == section:
                    # default item type: binary
                    itemtype = '9'
                    # if the file ends in .zip it is an archive
                    if file.endswith('.zip'):
                        itemtype = '5'
                    # push anything over 40 MB to the HTTP-only list
                    if 'M' in index[file]["size"] and float(index[file]["size"].replace('M', '')) > 40:
                        http[file] = index[file]
                    else:
                        # if there's no description, use the filename instead
                        if index[file]["desc"] == "":
                            f.write(itemtype + file.replace(section, '').replace('%20', ' ') + '\t\n')
                        else:
                            f.write(itemtype + index[file]["desc"] + '\t' + file.replace(section + '/', '').replace(section, '') + '\n')
            # write http-only files
            if len(http) > 0:
                f.write('\nFiles in this repo available over HTTP only:\n')
                for file in http.keys():
                    if http[file]["desc"] == "":
                        http[file]["desc"] = file.replace(section, '').replace('%20', ' ')
                    f.write('h' + http[file]["desc"] + '\tURL:' + file + '\n')
                f.write('\n')
            f.write('\n')
if input('do you want to just gen the sitemap? (y/n) ') == 'y':
    sitemap()
    sys.exit()
# categories url
url = "http://w2krepo.somnolescent.net/categories.html"
# the sections are all links on the categories page
r = requests.get(url)
soup = BeautifulSoup(r.text, "html.parser")
sections = soup.find_all("a")
# extract the hrefs
sections = [x.get("href") for x in sections]
# filter out the links that aren't sections
sections = [x for x in sections if x != "start.html"]
sections = [x for x in sections if x != "http://w2krepo.somnolescent.net/Submissions"]
sections = [x for x in sections if x != "http://w2krepo.somnolescent.net/changelog.txt"]
sections = [x for x in sections if x != "http://w2krepo.somnolescent.net/sitemap.html"]
print(sections)
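
# note: the loop below appends to sections while iterating over it; Python's
# list iterator also visits the appended items, so newly discovered
# subdirectories are themselves checked for further nesting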
# quickly scrape each section to see if there are more directories
for section in sections:
    print('checking ' + section + ' for more directories')
    r = requests.get(section)
    soup = BeautifulSoup(r.text, "html.parser")
    links = soup.find_all("a")
    for link in links:
        href = link.get("href")
        # only follow relative subdirectory links
        if href and href.endswith("/") and href not in ("./", "/") and not href.startswith(("https://", "http://", "/")):
            nurl = section
            if not href.startswith("/"):
                nurl += "/"
            nurl += href.replace('//', '/')
            sections.append(nurl)
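
# file names in the listings may contain spaces, so rows can't be split into
# columns positionally; instead each row is split on whitespace and the date
# field (####-##-##) is used as the anchor that separates the name from the
# date/time/size/description fields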
index = {}
# crawl the directories
for section in sections:
    print(section)
    r = requests.get(section)
    soup = BeautifulSoup(r.text, "html.parser")
    # the directory listing lives in a <pre> block
    pre = soup.find("pre")
    if pre is None:
        continue
    # get the line for each file and parse it
    for line in pre.text.split("\n"):
        line = line.split()
        # example row after splitting:
        # ['ZoomIt', '1.15.zip', '2020-08-07', '20:13', '43K', 'ZoomIt', '1.15']
        date = ''
        name = ''
        itime = ''
        size = ''
        desc = ''
        for i in range(len(line)):
            # use a regex to check if this part is the date (####-##-##)
            if re.match(r'^\d{4}-\d{2}-\d{2}$', line[i]) is not None:
                date = i
                # the strings before the date are the name of the file
                name = ' '.join(line[:date])
                # the first string after the date is the time
                itime = line[date + 1]
                # the second string after the date is the size
                size = line[date + 2]
                # the rest is the description
                desc = ' '.join(line[date + 3:]).replace(" info", "")
                # skip directories, which list their size as -
                if size == "-":
                    break
                # build the full url and add the file to the index
                nurl = section
                if not name.startswith("/"):
                    nurl += "/"
                nurl += name
                nurl = nurl.replace('//', '/')
                nurl = nurl.replace('http:/', 'http://')
                if not nurl.endswith("/"):
                    index[nurl] = {
                        "date": line[date],
                        "time": itime,
                        "size": size,
                        "desc": desc,
                        "section": section.replace('//', '/').replace('http:/', 'http://')
                    }
                # the date was found, so stop scanning this row
                break
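
# each index.json entry maps a file url to its metadata, e.g. (the section
# name here is illustrative, not taken from the site):
# "http://w2krepo.somnolescent.net/Utilities/ZoomIt 1.15.zip": {
#     "date": "2020-08-07", "time": "20:13", "size": "43K",
#     "desc": "ZoomIt 1.15",
#     "section": "http://w2krepo.somnolescent.net/Utilities"
# }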
# write index into pretty json
with open("index.json", "w") as f:
    json.dump(index, f, indent=4)
sitemap()