Skip to content

Instantly share code, notes, and snippets.

@thisismattmiller
Created March 13, 2022 05:27
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save thisismattmiller/9bde51b98983bad0d4d738eed0a4d82f to your computer and use it in GitHub Desktop.
Save thisismattmiller/9bde51b98983bad0d4d738eed0a4d82f to your computer and use it in GitHub Desktop.
import glob
import json


def host_from_index_line(line):
    """Return the hostname named by the "url" field of one index line.

    Each line is expected to hold some leading text followed by a JSON
    object (presumably the Common Crawl index output saved under
    ``data_sogb/`` -- confirm against the files actually present).
    Everything from the first ``{`` to the end of the line is parsed as
    JSON, so braces occurring inside the record itself (for example in
    the URL) do not truncate it.

    Args:
        line: One raw line of text, with or without a trailing newline.

    Returns:
        The part of the URL before the first ``/`` once any ``http://``
        or ``https://`` text has been removed, or ``None`` when the line
        has no JSON object or the object has no ``"url"`` key.

    Raises:
        json.JSONDecodeError: If the text after the first ``{`` is not
            valid JSON.
    """
    _, brace, remainder = line.partition('{')
    if not brace:
        # Blank or otherwise record-less line: nothing to extract.
        return None
    record = json.loads(brace + remainder)
    address = record.get('url')
    if address is None:
        return None
    return address.replace('http://', '').replace('https://', '').split('/')[0]


# A dict is used as an insertion-ordered set: each hostname is kept once,
# in the order it was first seen.
urls = {}
for file in glob.glob('data_sogb/*'):
    with open(file, encoding='utf-8') as inf:
        for line in inf:
            host = host_from_index_line(line)
            if host is not None:
                urls[host] = True

print(json.dumps(list(urls.keys()), indent=2))
import os

import requests

# Common Crawl collection IDs whose URL index is queried, listed newest first.
common_crawls = [
    "CC-MAIN-2022-05",
    "CC-MAIN-2021-49", "CC-MAIN-2021-43", "CC-MAIN-2021-39", "CC-MAIN-2021-31",
    "CC-MAIN-2021-25", "CC-MAIN-2021-21", "CC-MAIN-2021-17", "CC-MAIN-2021-10",
    "CC-MAIN-2021-04",
    "CC-MAIN-2020-50", "CC-MAIN-2020-45", "CC-MAIN-2020-40", "CC-MAIN-2020-34",
    "CC-MAIN-2020-29", "CC-MAIN-2020-24", "CC-MAIN-2020-16", "CC-MAIN-2020-10",
    "CC-MAIN-2020-05",
    "CC-MAIN-2019-51", "CC-MAIN-2019-47", "CC-MAIN-2019-43", "CC-MAIN-2019-39",
    "CC-MAIN-2019-35", "CC-MAIN-2019-30", "CC-MAIN-2019-26", "CC-MAIN-2019-22",
    "CC-MAIN-2019-18", "CC-MAIN-2019-13", "CC-MAIN-2019-09", "CC-MAIN-2019-04",
    "CC-MAIN-2018-51", "CC-MAIN-2018-47", "CC-MAIN-2018-43", "CC-MAIN-2018-39",
    "CC-MAIN-2018-34", "CC-MAIN-2018-30", "CC-MAIN-2018-26", "CC-MAIN-2018-22",
    "CC-MAIN-2018-17", "CC-MAIN-2018-13", "CC-MAIN-2018-09", "CC-MAIN-2018-05",
    "CC-MAIN-2017-51", "CC-MAIN-2017-47", "CC-MAIN-2017-43", "CC-MAIN-2017-39",
    "CC-MAIN-2017-34", "CC-MAIN-2017-30", "CC-MAIN-2017-26", "CC-MAIN-2017-22",
    "CC-MAIN-2017-17", "CC-MAIN-2017-13", "CC-MAIN-2017-09", "CC-MAIN-2017-04",
    "CC-MAIN-2016-50", "CC-MAIN-2016-44", "CC-MAIN-2016-40", "CC-MAIN-2016-36",
    "CC-MAIN-2016-30", "CC-MAIN-2016-26", "CC-MAIN-2016-22", "CC-MAIN-2016-18",
    "CC-MAIN-2016-07",
    "CC-MAIN-2015-48", "CC-MAIN-2015-40", "CC-MAIN-2015-35", "CC-MAIN-2015-32",
    "CC-MAIN-2015-27", "CC-MAIN-2015-22", "CC-MAIN-2015-18", "CC-MAIN-2015-14",
    "CC-MAIN-2015-11", "CC-MAIN-2015-06",
    "CC-MAIN-2014-52", "CC-MAIN-2014-49", "CC-MAIN-2014-42", "CC-MAIN-2014-41",
    "CC-MAIN-2014-35", "CC-MAIN-2014-23", "CC-MAIN-2014-15", "CC-MAIN-2014-10",
    "CC-MAIN-2013-48", "CC-MAIN-2013-20",
]

# Every response is saved as data_sogb/<crawl id>; create the directory up
# front so the first write does not fail when it is missing.
os.makedirs("data_sogb", exist_ok=True)

for cc in common_crawls:
    url = f"http://index.commoncrawl.org/{cc}-index?url=*.signourguestbook.com"
    print(url)
    try:
        # Without a timeout a stalled connection would block the run forever.
        req = requests.get(url, timeout=60)
    except requests.RequestException as err:
        # One unreachable index should not abandon the remaining crawls.
        print(f"request failed for {cc}: {err}")
        continue
    if req.status_code == 200:
        with open(f"data_sogb/{cc}", 'w', encoding="utf-8") as outf:
            outf.write(req.text)
    else:
        # Only successful responses are stored; report the rest so a gap in
        # data_sogb/ can be explained afterwards.
        print(f"skipped {cc}: HTTP {req.status_code}")
[
"1cda.signourguestbook.com",
"31act6668.signourguestbook.com",
"78thinfantrydiv.signourguestbook.com",
"abbeyjim.signourguestbook.com",
"abrev.signourguestbook.com",
"alrdoc.signourguestbook.com",
"angelwispa.signourguestbook.com",
"arrowoodbrian.signourguestbook.com",
"august.signourguestbook.com",
"authorcelia.signourguestbook.com",
"authoritarians.signourguestbook.com",
"authorkeqm.signourguestbook.com",
"avhsalumni.signourguestbook.com",
"bravedork.signourguestbook.com",
"bronze4u.signourguestbook.com",
"browngap.signourguestbook.com",
"bryanel.signourguestbook.com",
"catwholaughed.signourguestbook.com",
"chocal8kiss.signourguestbook.com",
"chrishill.signourguestbook.com",
"cigarmanandy.signourguestbook.com",
"cottonbalers.signourguestbook.com",
"cromer.signourguestbook.com",
"cubit99.signourguestbook.com",
"cybersulat.signourguestbook.com",
"deadmiledance.signourguestbook.com",
"deg.signourguestbook.com",
"dejavu48.signourguestbook.com",
"deyakusuma.signourguestbook.com",
"diocesesd.signourguestbook.com",
"diypoll.signourguestbook.com",
"djjayito.signourguestbook.com",
"dmc.signourguestbook.com",
"dodygood.signourguestbook.com",
"dragoni.signourguestbook.com",
"edinburg.signourguestbook.com",
"ellenmeister.signourguestbook.com",
"etnoyen.signourguestbook.com",
"fgfservices.signourguestbook.com",
"flipper0828.signourguestbook.com",
"flipper082859.signourguestbook.com",
"flipper828.signourguestbook.com",
"founder.signourguestbook.com",
"frankie66.signourguestbook.com",
"frobert.signourguestbook.com",
"froogle.signourguestbook.com",
"fujiprofessional.signourguestbook.com",
"funn2009.signourguestbook.com",
"gbstalag.signourguestbook.com",
"gearle123.signourguestbook.com",
"genejones.signourguestbook.com",
"gilgerarddotcom.signourguestbook.com",
"gmo2010.signourguestbook.com",
"goatlocker.signourguestbook.com",
"gr8danelover.signourguestbook.com",
"gregoryabbott.signourguestbook.com",
"gwmcrae.signourguestbook.com",
"www.hellsangelsberdoo.signourguestbook.com",
"homeagainfarm.signourguestbook.com",
"hsjeguestbook.signourguestbook.com",
"ifp.signourguestbook.com",
"iranianfootballpage.signourguestbook.com",
"iuecone.signourguestbook.com",
"janapood.signourguestbook.com",
"jeremycallaghanfansite.signourguestbook.com",
"jimjammer1.signourguestbook.com",
"jiyushinkai.signourguestbook.com",
"johnmorganhappyhour.signourguestbook.com",
"judygarland7.signourguestbook.com",
"k6ge.signourguestbook.com",
"kathytemean.signourguestbook.com",
"khesanhvet.signourguestbook.com",
"legends.signourguestbook.com",
"limahl.signourguestbook.com",
"linedancer.signourguestbook.com",
"louisetaylor.signourguestbook.com",
"lukabal.signourguestbook.com",
"majesticworld.signourguestbook.com",
"marilynsorensen.signourguestbook.com",
"mdrumm.signourguestbook.com",
"middlegeorgiaparanormal.signourguestbook.com",
"mysteryfeet.signourguestbook.com",
"nashwaaksis.signourguestbook.com",
"neatoday-danvers.signourguestbook.com",
"neatoday-reno.signourguestbook.com",
"neshkov.signourguestbook.com",
"netwish.signourguestbook.com",
"ngravley.signourguestbook.com",
"nicko62.signourguestbook.com",
"nverona.signourguestbook.com",
"onepeople.signourguestbook.com",
"onsight.signourguestbook.com",
"pannudds.signourguestbook.com",
"patjamesguestbook.signourguestbook.com",
"rheidt.signourguestbook.com",
"richards.signourguestbook.com",
"rorocny.signourguestbook.com",
"rprather.signourguestbook.com",
"ruralwillys.signourguestbook.com",
"scbbbc.signourguestbook.com",
"segitseganglia.signourguestbook.com",
"selectee.signourguestbook.com",
"shahid74.signourguestbook.com",
"spiderlakeretreat.signourguestbook.com",
"strwynd.signourguestbook.com",
"sweetblues.signourguestbook.com",
"teahouse.signourguestbook.com",
"thmch.signourguestbook.com",
"tigerforcerecon.signourguestbook.com",
"tsrl.signourguestbook.com",
"undertheblades.signourguestbook.com",
"usscanopus.signourguestbook.com",
"ussfranklindroosevelt.signourguestbook.com",
"usshollandas32.signourguestbook.com",
"ussstr.signourguestbook.com",
"vavau.signourguestbook.com",
"vines4u.signourguestbook.com",
"w5dxs.signourguestbook.com",
"webmasteratbigt.signourguestbook.com",
"wellofstars.signourguestbook.com",
"westburyfd.signourguestbook.com",
"westmauivacation.signourguestbook.com",
"yerusha.signourguestbook.com",
"signourguestbook.com",
"a2p.signourguestbook.com",
"aaronforever.signourguestbook.com",
"agenesiscorpuscallosum.signourguestbook.com",
"akphoto7.signourguestbook.com",
"auban.signourguestbook.com",
"biggles.signourguestbook.com",
"blackpoolweddings.signourguestbook.com",
"bobkunnel.signourguestbook.com",
"cavaliers.signourguestbook.com",
"classicalmysterytour.signourguestbook.com",
"cottages.signourguestbook.com",
"crandr.signourguestbook.com",
"davidlharrison.signourguestbook.com",
"dominique.signourguestbook.com",
"dorgalli.signourguestbook.com",
"dorgalli2.signourguestbook.com",
"e2dennis.signourguestbook.com",
"earles.signourguestbook.com",
"exlancs.signourguestbook.com",
"fgfprojects.signourguestbook.com",
"hotclub.signourguestbook.com",
"ileategbe.signourguestbook.com",
"jansmurph.signourguestbook.com",
"jarhead9962.signourguestbook.com",
"jarhead9962-2.signourguestbook.com",
"jessicahaffer.signourguestbook.com",
"jhatch.signourguestbook.com",
"jlgage01.signourguestbook.com",
"journeyguestbook.signourguestbook.com",
"judyarnold.signourguestbook.com",
"keeney.signourguestbook.com",
"kiaorana.signourguestbook.com",
"klaxtonbrown.signourguestbook.com",
"larryebailey.signourguestbook.com",
"luishiggins.signourguestbook.com",
"minty.signourguestbook.com",
"mk74scott.signourguestbook.com",
"ngsir.signourguestbook.com",
"nickmessinger.signourguestbook.com",
"patj5338.signourguestbook.com",
"president.signourguestbook.com",
"pwhsa.signourguestbook.com",
"qstogether.signourguestbook.com",
"rietomosaka.signourguestbook.com",
"rinksgal.signourguestbook.com",
"seinebight.signourguestbook.com",
"shadyacresfl.signourguestbook.com",
"skkky.signourguestbook.com",
"sobs.signourguestbook.com",
"southerncpafirm.signourguestbook.com",
"southernsteel.signourguestbook.com",
"speculumgregis.signourguestbook.com",
"sweetwaterbranch.signourguestbook.com",
"tghhampton.signourguestbook.com",
"thomaswikman.signourguestbook.com",
"truthministries.signourguestbook.com",
"ukzorro.signourguestbook.com",
"6987th.signourguestbook.com",
"brasseauxskennel.signourguestbook.com",
"dccc.signourguestbook.com",
"denicefranke.signourguestbook.com",
"dkranig.signourguestbook.com",
"elanamusic.signourguestbook.com",
"fairwayclay.signourguestbook.com",
"gtisc.signourguestbook.com",
"junglerosebeauty.signourguestbook.com",
"marywel.signourguestbook.com",
"mrp.signourguestbook.com",
"oliviagracearmand.signourguestbook.com",
"oshws.signourguestbook.com",
"raff.signourguestbook.com",
"robinson.signourguestbook.com",
"twest1117.signourguestbook.com",
"christiansurvivors.signourguestbook.com",
"cvt.signourguestbook.com",
"demonknightsmc.signourguestbook.com",
"dmccunn.signourguestbook.com",
"econ.signourguestbook.com",
"feedback.signourguestbook.com",
"fll.signourguestbook.com",
"heirloomjewelryandcoins.signourguestbook.com",
"highla12.signourguestbook.com",
"jerrywmcdaniel.signourguestbook.com",
"jmaclean.signourguestbook.com",
"km811.signourguestbook.com",
"lburg.signourguestbook.com",
"plantters.signourguestbook.com",
"renobailey.signourguestbook.com",
"steveweed.signourguestbook.com",
"stirl1.signourguestbook.com",
"tymarshal.signourguestbook.com",
"whsa.signourguestbook.com",
"mfwc.signourguestbook.com",
"pioneers.signourguestbook.com",
"roughneckcity.signourguestbook.com",
"aceofcups.signourguestbook.com",
"anthonybooty.signourguestbook.com",
"campnaire.signourguestbook.com",
"carolann.signourguestbook.com",
"cvmassage.signourguestbook.com",
"epmcentral.signourguestbook.com",
"haysjj.signourguestbook.com",
"ife.signourguestbook.com",
"kaeo.signourguestbook.com",
"kathih.signourguestbook.com",
"malites.signourguestbook.com",
"margaritabiz.signourguestbook.com",
"microflyers.signourguestbook.com",
"myjeblog.signourguestbook.com",
"rivhunter.signourguestbook.com",
"rosary.signourguestbook.com",
"sandsoyveret.signourguestbook.com",
"usstroutss566.signourguestbook.com",
"amazinglyawesome.signourguestbook.com",
"hoodrelatedentertainment.signourguestbook.com",
"sgforums.signourguestbook.com",
"smile.signourguestbook.com",
"yellowstone.signourguestbook.com",
"yourportchestersnapshot.signourguestbook.com",
"chayzlounge.signourguestbook.com",
"cjohnson.signourguestbook.com",
"pcdl.signourguestbook.com",
"rthompson.signourguestbook.com",
"waymartlodge.signourguestbook.com",
"ladyphenie.signourguestbook.com",
"hermajesty.signourguestbook.com",
"otkws1.signourguestbook.com",
"stmarkshighinfo.signourguestbook.com",
"vuguest.signourguestbook.com",
"wimpolepast.signourguestbook.com",
"rakshi.signourguestbook.com",
"watzegtdebijbel.signourguestbook.com",
"wimbledonosteopathy.signourguestbook.com",
"320thwebmaster.signourguestbook.com",
"joozis.signourguestbook.com",
"jwhc.signourguestbook.com",
"customcakes.signourguestbook.com",
"newhopepennsylvania.signourguestbook.com",
"txpride.signourguestbook.com",
"grannyrocks.signourguestbook.com",
"nekmar.signourguestbook.com",
"tonytdm.signourguestbook.com",
"usvisamex.signourguestbook.com",
"bcomfetish.signourguestbook.com",
"ginnydeer.signourguestbook.com",
"girulinda.signourguestbook.com",
"jay901rana.signourguestbook.com",
"judel.signourguestbook.com",
"madjingaye.signourguestbook.com",
"marine-staff-sergeant.signourguestbook.com",
"pypes.signourguestbook.com",
"tellurideairport.signourguestbook.com",
"woolybear.signourguestbook.com",
"mamaslittlecatering.signourguestbook.com",
"starfish.signourguestbook.com",
"ngsinlin.signourguestbook.com",
"hleaglesnewssite.signourguestbook.com",
"kathykathy.signourguestbook.com",
"leahsjourney.signourguestbook.com",
"sorsogonunited.signourguestbook.com",
"vvmc.signourguestbook.com",
"padrecataag.signourguestbook.com",
"sagebrush-cantina.signourguestbook.com",
"donscottcourt.signourguestbook.com",
"dvc.signourguestbook.com",
"travelingjournalis.signourguestbook.com",
"standby5.signourguestbook.com",
"sedonakat.signourguestbook.com",
"masticmaster.signourguestbook.com",
"sauceboss.signourguestbook.com",
"soc63.signourguestbook.com",
"desipandit.signourguestbook.com",
"hethaifa.signourguestbook.com",
"sola24.signourguestbook.com"
]
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment