goofy internal index generation script thing
# this script scrapes w2krepo.somnolescent.net and downloads the indexes of every file into a json listing
#
# third-party packages: beautifulsoup4, requests
from bs4 import BeautifulSoup
import requests
import json
import sys
import re
import time
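
# rough flow: the prompt below can skip straight to regenerating the output
# files from an existing index.json; otherwise the script scrapes
# categories.html for section links, discovers nested directories, parses each
# Apache-style <pre> listing into index.json, then renders sitemap.html and
# gopherstyle.txt from it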
def sitemap():
    # load index.json into index
    with open("index.json", "r") as f:
        index = json.load(f)
    # write an html sitemap out of the data
    with open("sitemap.html", "w") as f:
        f.write('<html><head><title>w2krepo : Index</title></head><body text="#FFFFFF" link="#FFFFFF" vlink="#FFFFFF" bgcolor="#3A6EA5" alink="#C0C0C0">')
        # build a unique, sorted list of sections
        sections = []
        for i in index:
            if index[i]["section"] not in sections:
                sections.append(index[i]["section"])
        sections.sort()
        print(sections)
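        # navigation header; the target="_top" "Exit frames" link suggests
        # this page is normally viewed inside a frameset on the site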
f.write(""" | |
<font face="Trebuchet MS" size="1"> | |
<a href="/" target="_blank"><b>w2krepo</b></a> | |
<a href="./" target="_top">Exit frames</a> | |
<a href="/Submissions">Contributions</a> | |
</font> | |
<font face="Trebuchet MS"> | |
""") | |
# write table of contents | |
f.write("<ul>") | |
for i in sections: | |
i = i.replace('http://w2krepo.somnolescent.net/', '') | |
f.write("<li><a href='#" + i + "'>" + i.replace('%20', ' ') + "</a></li>") | |
f.write("</ul>") | |
f.write("<p><a href=""https://gist.github.com/dotcomboom/580521247ed24430fb7c75896d7938de"">Generated</a> at {0}</p>".format(time.strftime("%Y-%m-%d %H:%M:%S"))) | |
# make a table | |
for section in sections: | |
f.write("<h2>" + section.replace('http://w2krepo.somnolescent.net/', '').replace('%20', ' ') + "<a name=" + section.replace('http://w2krepo.somnolescent.net/', '') + "></a></h2>") | |
f.write("<table>") | |
f.write("<tr><th>File</th><th>Date</th><th>Time</th><th>Size</th><th>Description</th></tr>") | |
for file in sorted(index.keys()): | |
if index[file]["section"] == section: | |
f.write("<tr>") | |
f.write("<td><a href=\"" + file.replace('http://w2krepo.somnolescent.net/', '/') + "\">" + file.replace(section + '/', '').replace(section, '') + "</a></td>") | |
f.write("<td>" + index[file]["date"] + "</td>") | |
f.write("<td>" + index[file]["time"] + "</td>") | |
f.write("<td>" + index[file]["size"] + "</td>") | |
f.write("<td>" + index[file]["desc"] + "</td>") | |
f.write("</tr>") | |
f.write("</table>") | |
# f.write("<ul>") | |
# for key in index: | |
# #try: | |
# f.write("<li><a href='" + key + "'>" + index[key]['section'] + ": " + key + " (" + index[key]['desc'] + ")" + "</a></li>") | |
# ##except KeyError: | |
# # print('keyerr: ' + key) | |
# f.write("</ul>") | |
f.write("</font></body></html>") | |
gopher() | |
def gopher():
    with open("index.json", "r") as f:
        index = json.load(f)
    # writes a txt file with gopher markup
    with open("gopherstyle.txt", "w") as f:
        # each section gets a little heading and then a list of files
        # build a unique, sorted list of sections
        sections = []
        for i in index:
            if index[i]["section"] not in sections:
                sections.append(index[i]["section"])
        sections.sort()
        print(sections)
        for section in sections:
            f.write("1<< Back to gopher.somnolescent.net\t/\n")
            f.write("1< Back to w2krepo\t/w2krepo\n")
            f.write('w2krepo on Gopher\n')
            friendlysection = section.replace('http://w2krepo.somnolescent.net/', '').replace('%20', ' ')
            f.write(friendlysection + '\n')
            f.write('--------------------------------------------------------------\n')
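            # gophermap-style menu lines: item type character, display string,
            # a tab, then the selector; host/port fields are left for the
            # gopher server to fill in
            # (1 = menu, 5 = archive, 9 = binary, h = web link)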
            # files too large for gopher, listed at the end as HTTP links
            http = {}
            # write files in that section
            for file in sorted(index.keys()):
                if index[file]["section"] == section:
                    # default item type: binary
                    itemtype = '9'
                    # if the file ends in .zip it is an archive
                    if file.endswith('.zip'):
                        itemtype = '5'
                    # push anything over 40 MB to the HTTP-only list
                    if 'M' in index[file]["size"] and float(index[file]["size"].replace('M', '')) > 40:
                        http[file] = index[file]
                    else:
                        # if there's no description, use the filename instead
                        if index[file]["desc"] == "":
                            f.write(itemtype + file.replace(section, '').replace('%20', ' ') + '\t\n')
                        else:
                            f.write(itemtype + index[file]["desc"] + '\t' + file.replace(section + '/', '').replace(section, '') + '\n')
            # write http-only files
            if len(http) > 0:
                f.write('\nFiles in this repo available over HTTP only:\n')
                for file in http.keys():
                    if http[file]["desc"] == "":
                        http[file]["desc"] = file.replace(section, '').replace('%20', ' ')
                    f.write('h' + http[file]["desc"] + '\tURL:' + file + '\n')
                f.write('\n')
            f.write('\n')
if input('do you want to just gen the sitemap? (y/n) ') == 'y':
    sitemap()
    sys.exit()
# categories url
url = "http://w2krepo.somnolescent.net/categories.html"
# the sections are all links on the categories page
r = requests.get(url)
soup = BeautifulSoup(r.text, "html.parser")
sections = soup.find_all("a")
# extract the hrefs
sections = [x.get("href") for x in sections]
# filter out the links that aren't sections
sections = [x for x in sections if x != "start.html"]
sections = [x for x in sections if x != "http://w2krepo.somnolescent.net/Submissions"]
sections = [x for x in sections if x != "http://w2krepo.somnolescent.net/changelog.txt"]
sections = [x for x in sections if x != "http://w2krepo.somnolescent.net/sitemap.html"]
print(sections)
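
# note: the loop below appends to sections while iterating over it; Python's
# list iterator also visits the appended items, so newly discovered
# subdirectories are themselves checked for further nesting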
# quickly scrape each section to see if there are more directories
for section in sections:
    print('checking ' + section + ' for more directories')
    r = requests.get(section)
    soup = BeautifulSoup(r.text, "html.parser")
    links = soup.find_all("a")
    for link in links:
        href = link.get("href")
        # only follow relative subdirectory links
        if href and href.endswith("/") and href not in ("./", "/") and not href.startswith(("https://", "http://", "/")):
            nurl = section
            if not href.startswith("/"):
                nurl += "/"
            nurl += href.replace('//', '/')
            sections.append(nurl)
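
# file names in the listings may contain spaces, so rows can't be split into
# columns positionally; instead each row is split on whitespace and the date
# field (####-##-##) is used as the anchor that separates the name from the
# date/time/size/description fields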
index = {}
# crawl the directories
for section in sections:
    print(section)
    r = requests.get(section)
    soup = BeautifulSoup(r.text, "html.parser")
    # the directory listing lives in a <pre> block
    pre = soup.find("pre")
    if pre is None:
        continue
    # get the line for each file and parse it
    for line in pre.text.split("\n"):
        line = line.split()
        # example row after splitting:
        # ['ZoomIt', '1.15.zip', '2020-08-07', '20:13', '43K', 'ZoomIt', '1.15']
        date = ''
        name = ''
        itime = ''
        size = ''
        desc = ''
        for i in range(len(line)):
            # use a regex to check if this part is the date (####-##-##)
            if re.match(r'^\d{4}-\d{2}-\d{2}$', line[i]) is not None:
                date = i
                # the strings before the date are the name of the file
                name = ' '.join(line[:date])
                # the first string after the date is the time
                itime = line[date + 1]
                # the second string after the date is the size
                size = line[date + 2]
                # the rest is the description
                desc = ' '.join(line[date + 3:]).replace(" info", "")
                # skip directories, which list their size as -
                if size == "-":
                    break
                # build the full url and add the file to the index
                nurl = section
                if not name.startswith("/"):
                    nurl += "/"
                nurl += name
                nurl = nurl.replace('//', '/')
                nurl = nurl.replace('http:/', 'http://')
                if not nurl.endswith("/"):
                    index[nurl] = {
                        "date": line[date],
                        "time": itime,
                        "size": size,
                        "desc": desc,
                        "section": section.replace('//', '/').replace('http:/', 'http://')
                    }
                # the date was found, so stop scanning this row
                break
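
# each index.json entry maps a file url to its metadata, e.g. (the section
# name here is illustrative, not taken from the site):
# "http://w2krepo.somnolescent.net/Utilities/ZoomIt 1.15.zip": {
#     "date": "2020-08-07", "time": "20:13", "size": "43K",
#     "desc": "ZoomIt 1.15",
#     "section": "http://w2krepo.somnolescent.net/Utilities"
# }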
# write index into pretty json
with open("index.json", "w") as f:
    json.dump(index, f, indent=4)
sitemap()