goofy internal index generation script thing
# this script scrapes w2krepo.somnolescent.net and downloads the indexes of every file into a json listing
#
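# for reference, an entry in the generated index.json comes out roughly like
# this (illustrative example - the URL is made up; the field names match what
# the crawler below writes):
#
#   "http://w2krepo.somnolescent.net/Apps/ZoomIt 1.15.zip": {
#       "date": "2020-08-07",
#       "time": "20:13",
#       "size": "43K",
#       "desc": "ZoomIt 1.15",
#       "section": "http://w2krepo.somnolescent.net/Apps"
#   }
#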
# third-party dependencies: beautifulsoup4, requests
from bs4 import BeautifulSoup
import requests
# standard library
import json
import os
import sys
import re
import time


def sitemap():
    # load index.json into index
    with open("index.json", "r") as f:
        index = json.load(f)
    # write an html sitemap out of the data
    with open("sitemap.html", "w") as f:
        f.write("<html><head><title>w2krepo : Index</title></head><body text=\"#FFFFFF\" link=\"#FFFFFF\" vlink=\"#FFFFFF\" bgcolor=\"#3A6EA5\" alink=\"#C0C0C0\">")
        # make a table of the index separated by section
        # UNIQUE list of sections
        sections = []
        for i in index:
            if index[i]["section"] not in sections:
                sections.append(index[i]["section"])
        # sort
        sections.sort()
        print(sections)
        f.write("""
<font face="Trebuchet MS" size="1">
<a href="/" target="_blank"><b>w2krepo</b></a>&nbsp;&nbsp;
<a href="./" target="_top">Exit frames</a>&nbsp;&nbsp;
<a href="/Submissions">Contributions</a>
</font>
<font face="Trebuchet MS">
""")
        # write table of contents
        f.write("<ul>")
        for i in sections:
            i = i.replace('http://w2krepo.somnolescent.net/', '')
            f.write("<li><a href='#" + i + "'>" + i.replace('%20', ' ') + "</a></li>")
        f.write("</ul>")
        f.write("<p><a href=\"https://gist.github.com/dotcomboom/580521247ed24430fb7c75896d7938de\">Generated</a> at {0}</p>".format(time.strftime("%Y-%m-%d %H:%M:%S")))
        # make a table per section
        for section in sections:
            f.write("<h2>" + section.replace('http://w2krepo.somnolescent.net/', '').replace('%20', ' ') + "<a name=" + section.replace('http://w2krepo.somnolescent.net/', '') + "></a></h2>")
            f.write("<table>")
            f.write("<tr><th>File</th><th>Date</th><th>Time</th><th>Size</th><th>Description</th></tr>")
            for file in sorted(index.keys()):
                if index[file]["section"] == section:
                    f.write("<tr>")
                    f.write("<td><a href=\"" + file.replace('http://w2krepo.somnolescent.net/', '/') + "\">" + file.replace(section + '/', '').replace(section, '') + "</a></td>")
                    f.write("<td>" + index[file]["date"] + "</td>")
                    f.write("<td>" + index[file]["time"] + "</td>")
                    f.write("<td>" + index[file]["size"] + "</td>")
                    f.write("<td>" + index[file]["desc"] + "</td>")
                    f.write("</tr>")
            f.write("</table>")
        # f.write("<ul>")
        # for key in index:
        #     # try:
        #     f.write("<li><a href='" + key + "'>" + index[key]['section'] + ": " + key + " (" + index[key]['desc'] + ")" + "</a></li>")
        #     # except KeyError:
        #     #     print('keyerr: ' + key)
        # f.write("</ul>")
        f.write("</font></body></html>")
    gopher()
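

# quick reference for the gopher item types gopher() writes below; each menu
# line comes out as "<type><display string>\t<selector>":
#   1  menu / directory link
#   5  archive file (used here for .zip downloads)
#   9  generic binary file (the default)
#   h  HTML / URL link, for files only served over HTTP
# e.g. a line might look like "5ZoomIt 1.15\tZoomIt 1.15.zip" (illustrative)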
def gopher():
    with open("index.json", "r") as f:
        index = json.load(f)
    # writes a txt file with gopher markup
    with open("gopherstyle.txt", "w") as f:
        # each section gets a little heading and then a list of files
        # UNIQUE list of sections
        sections = []
        for i in index:
            if index[i]["section"] not in sections:
                sections.append(index[i]["section"])
        # sort
        sections.sort()
        print(sections)
        for section in sections:
            f.write("1<< Back to gopher.somnolescent.net\t/\n")
            f.write("1< Back to w2krepo\t/w2krepo\n")
            f.write('w2krepo on Gopher\n')
            friendlysection = section.replace('http://w2krepo.somnolescent.net/', '').replace('%20', ' ')
            f.write(friendlysection + '\n')
            f.write('--------------------------------------------------------------\n')
            # http-only file store
            http = {}
            # write files in that section
            for file in sorted(index.keys()):
                if index[file]["section"] == section:
                    # item type: 9 (binary) by default, 5 (archive) if the file is a zip
                    type = '9'
                    if file.endswith('.zip'):
                        type = '5'
                    # files over 40M are kept off gopher and listed as HTTP-only links instead
                    if 'M' in index[file]["size"] and float(index[file]["size"].replace('M', '')) > 40:
                        http[file] = index[file]
                    else:
                        # if there's no description, use the filename as the display string
                        if index[file]["desc"] == "":
                            f.write(type + file.replace(section + '/', '').replace(section, '').replace('%20', ' ') + '\t' + file.replace(section + '/', '').replace(section, '') + '\n')
                        else:
                            f.write(type + index[file]["desc"] + '\t' + file.replace(section + '/', '').replace(section, '') + '\n')
            # write http-only files
            if len(http) > 0:
                f.write('\nFiles in this repo available over HTTP only:\n')
                for file in http.keys():
                    if http[file]["desc"] == "":
                        http[file]["desc"] = file.replace(section, '').replace('%20', ' ')
                    f.write('h' + http[file]["desc"] + '\tURL:' + file + '\n')
                f.write('\n')
            f.write('\n')
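
# everything below runs when the script is invoked directly: optionally just
# regenerate the sitemap from the existing index.json, otherwise crawl
# categories.html for sections, crawl each section for files, write index.json,
# and finally rebuild the sitemap (which also rebuilds the gopher listing)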

if input('do you want to just gen the sitemap? (y/n) ') == 'y':
    sitemap()
    sys.exit()
# categories url
url = "http://w2krepo.somnolescent.net/categories.html"
# get all the sections (they're links on the page)
r = requests.get(url)
soup = BeautifulSoup(r.text, "html.parser")
sections = soup.find_all("a")
# extract the links
sections = [x.get("href") for x in sections]
# remove start.html
sections = [x for x in sections if x != "start.html"]
# remove http://w2krepo.somnolescent.net/Submissions
sections = [x for x in sections if x != "http://w2krepo.somnolescent.net/Submissions"]
# add http://w2krepo.somnolescent.net/Submissions/Queue
#sections.append("http://w2krepo.somnolescent.net/Submissions/Queue")
sections = [x for x in sections if x != "http://w2krepo.somnolescent.net/changelog.txt"]
sections = [x for x in sections if x != "http://w2krepo.somnolescent.net/sitemap.html"]
print(sections)
# quickly scrape each to see if there are more directories
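# note: appending to `sections` inside the loop below means any subdirectories
# found are added to the end of the list and get checked themselves on a later
# pass of the same loop, so nested directories are picked up too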
for section in sections:
    print('checking ' + section + ' for more directories')
    r = requests.get(section)
    soup = BeautifulSoup(r.text, "html.parser")
    links = soup.find_all("a")
    for link in links:
        # only follow relative subdirectory links (skip ./, /, and absolute URLs)
        if link.get("href").endswith("/") and not link.get("href") == "./" and not link.get("href") == "/" and not link.get("href").startswith('https://') and not link.get('href').startswith("http://") and not link.get("href").startswith('/'):
            nurl = section
            if not link.get('href').startswith("/"):
                nurl += "/"
            nurl += link.get("href").replace('//', '/')
            sections.append(nurl)
index = {}
# crawl the directories
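# each section page is assumed to be a plain server-generated directory listing
# inside a <pre> tag; a raw line looks roughly like (illustrative, spacing varies):
#   ZoomIt 1.15.zip    2020-08-07 20:13   43K  ZoomIt 1.15 info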
for section in sections:
    print(section)
    r = requests.get(section)
    soup = BeautifulSoup(r.text, "html.parser")
    # the file listing lives in a <pre> block
    pre = soup.find("pre")
    # parse each line of the listing
    for line in pre.text.split("\n"):
        line = line.split()
        # format:
        # ['ZoomIt', '1.15.zip', '2020-08-07', '20:13', '43K', 'ZoomIt', '1.15']
        # figure out what the index of the date is
        # (the date is in the format ####-##-##)
        date = ''
        name = ''
        itime = ''
        size = ''
        desc = ''
        for i in range(len(line)):
            # use a regex (^\d{4}-\d{2}-\d{2}$) to check if this part is a date
            if re.match(r'^\d{4}-\d{2}-\d{2}$', line[i]) is not None:
                # date holds the index of the date token
                date = i
                # the strings before the date are the name of the file
                name = ' '.join(line[:date])
                # first string after the date is the time
                itime = line[date + 1]
                # second after the date is the size
                size = line[date + 2]
                # the rest is the description
                desc = ' '.join(line[date + 3:]).replace(" info", "")
                # a size of "-" means a directory; skip it
                if size == "-":
                    break
                # add the file to the index
                nurl = section
                if not name.startswith("/"):
                    nurl += "/"
                nurl += name
                nurl = nurl.replace('//', '/')
                nurl = nurl.replace('http:/', 'http://')
                if not nurl.endswith("/"):
                    index[nurl] = {
                        "date": line[date],
                        "time": itime,
                        "size": size,
                        "desc": desc,
                        "section": section.replace('//', '/').replace('http:/', 'http://')
                    }
                # done with this line; break out of the token loop
                break
# write index into pretty json
with open("index.json", "w") as f:
    json.dump(index, f, indent=4)
sitemap()