-
-
Save JustAnotherArchivist/b6c15a3dda636c9436f244b2b482720f to your computer and use it in GitHub Desktop.
Debugging ArchiveTeam/wpull#365
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import asyncio
import functools
import itertools

import dns.query
import wpull.errors
import wpull.network.dns
# Keep a handle on the real implementation so the wrapper below can delegate
# to it after dns.query.udp has been replaced.
orig_dns_query_udp = dns.query.udp


@functools.wraps(orig_dns_query_udp)
def udp(message, *args, **kwargs):
    """Call the original ``dns.query.udp`` and dump the query if it raises ``ValueError``.

    All arguments are passed through unchanged and the original return value
    is returned unchanged.

    Args:
        message: The query object handed to ``dns.query.udp``; it must provide
            ``to_wire()``, which is called only on the failure path.
        *args: Remaining positional arguments, forwarded as-is.
        **kwargs: Keyword arguments, forwarded as-is.

    Raises:
        ValueError: Re-raised untouched after the diagnostics are printed.
            Every other exception propagates without any output.
    """
    try:
        return orig_dns_query_udp(message, *args, **kwargs)
    except ValueError:
        # Print the call arguments and the serialised query so the failing
        # request can be inspected, then let the error continue upwards.
        print(repr((message,) + args), repr(kwargs))
        print(repr(message.to_wire()))
        raise


# Install the wrapper; callers that look up ``dns.query.udp`` at call time
# will now go through it.
dns.query.udp = udp
# Resolver under test: IPv4 only, 30 second timeout, with a fresh cache
# obtained from Resolver.new_cache().
resolver = wpull.network.dns.Resolver(
    family=wpull.network.dns.IPFamilyPreference.ipv4_only,
    timeout=30,
    rotate=None,
    cache=wpull.network.dns.Resolver.new_cache(),
)
# Explicitly override the nameserver to Google's
# (this reaches into the private _dns_resolver attribute of the wpull resolver).
resolver._dns_resolver.nameservers = ['8.8.8.8']
# Top 100 domains from www.solarpaneltalk.com-inf-20180806-141951-3ac34
# grep -Po 'Fetching ‘https?://\K[^/]+(?=[/’])' wpull.log | awk '{cnt[$1]+=1} END { for (domain in cnt) { print cnt[domain] " " domain }}' | sort -nr | head -100 | awk '{print "hosts.append('"'"'" $2 "'"'"')"}'
# (The command above emits one hosts.append(...) line per domain; those lines
# are folded into a single list literal here, in the same order.)
hosts = [
    'www.solarpaneltalk.com',
    'static.xx.fbcdn.net',
    'images-na.ssl-images-amazon.com',
    'www.screenlightandgrip.com',
    'www.solarreviews.com',
    'i.ebayimg.com',
    'jonsguide.org',
    'www.facebook.com',
    'www.youtube.com',
    'www.ironridge.com',
    'fonts.gstatic.com',
    'www.amazon.com',
    'energy.sandia.gov',
    'www.solarroofhook.com',
    'en.wikipedia.org',
    'www.homepower.com',
    'www.legis.ga.gov',
    'www.quickmountpv.com',
    'www.solaredge.com',
    'cdn7.bigcommerce.com',
    'www.genstattu.com',
    'www.victronenergy.com',
    'www.outbackpower.com',
    'www.electricgeneratorsdirect.com',
    'www.ebay.com',
    'ir.ebaystatic.com',
    'i.ytimg.com',
    'www.pnas.org',
    'books.google.com.au',
    'www.solar-estimate.org',
    'www.thermotekusa.com',
    'usbattery.com',
    '3ohkdk3zdzcq1dul50oqjvvf-wpengine.netdna-ssl.com',
    'pvoutput.org',
    'www.thisoldhouse.com',
    'www.greenbuildingadvisor.com',
    'www.photonics.com',
    'webosolar.com',
    'fonts.googleapis.com',
    'www.google.com',
    'ssl.gstatic.com',
    'www.fieldlines.com',
    'us.sunpower.com',
    'sepbatteries.com',
    'd114hh0cykhyb0.cloudfront.net',
    'www.caiso.com',
    'i5.walmartimages.com',
    'www.lg.com',
    'www.thesuntrip.com',
    'engineering.stanford.edu',
    'energyandmines.com',
    '2n1s7w3qw84d2ysnx3ia2bct-wpengine.netdna-ssl.com',
    'www.elecdirect.com',
    's2.ibtimes.com',
    'freebeacon.com',
    '41j5tc3akbrn3uezx5av0jj1bgm-wpengine.netdna-ssl.com',
    'm.media-amazon.com',
    'file.scirp.org',
    'hpevs.com',
    'www.sparelys.no',
    'www.energymatters.com.au',
    'img.dxcdn.com',
    'dkasolarcentre.com.au',
    'd4td1un6f2hha.cloudfront.net',
    'blog.caranddriver.com',
    'www.voltaicsystems.com',
    'www.saftbatteries.com',
    'www.wholesalesolar.com',
    'www.ddmotorsystems.com',
    'batteryuniversity.com',
    'www.googletagmanager.com',
    'static.squarespace.com',
    'www.wbdg.org',
    'thesolarstore.com',
    'static-na.payments-amazon.com',
    'enphase.com',
    'assets.alicdn.com',
    'www.solarquotes.com.au',
    'www.batteriesplus.com',
    'indaily.com.au',
    'ajax.googleapis.com',
    'ae01.alicdn.com',
    'www.eevblog.com',
    's12.photobucket.com',
    'www.solarpanelstore.com',
    'www.soldapools.com',
    'www.dropbox.com',
    'hybridautocenter.com',
    'www.exeltech.com',
    'www.bluepacificsolar.com',
    'powerequipment.honda.com',
    'gbbattery.com',
    'www.powerequipmentdirect.com',
    'realgoods.com',
    'c5.rgstatic.net',
    's137.photobucket.com',
    'gcell.com',
    'forum.solar-electric.com',
    'www.seia.org',
    'www.samlexamerica.com',
    # ... and the two hosts that were fetching at the crash, just to be sure.
    'www.aqua-sun-intl.com',
    'www.katadyn.com',
]
async def main(resolver, hosts):
    """Resolve the names in *hosts* round-robin, repeating indefinitely.

    Each name is printed before ``resolver.resolve`` is awaited for it.
    Because the names are cycled with ``itertools.cycle``, the coroutine only
    finishes when *hosts* is empty (it then returns immediately) or when an
    exception other than ``wpull.errors.NetworkError`` escapes.

    Args:
        resolver: Object providing an awaitable ``resolve(host)`` method.
        hosts: Iterable of host names to resolve.

    Raises:
        Exception: Anything raised by ``resolver.resolve`` except
            ``wpull.errors.NetworkError`` propagates to the caller.
    """
    for host in itertools.cycle(hosts):
        print('Resolving {}'.format(host))
        try:
            await resolver.resolve(host)
        except wpull.errors.NetworkError:
            # Deliberately ignored: a failed lookup must not stop the loop,
            # only unexpected errors should surface.
            pass
# Drive main() on the event loop; this call only returns if main() finishes
# and otherwise ends when an exception propagates out of it.
loop = asyncio.get_event_loop()
loop.run_until_complete(main(resolver, hosts))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment