Skip to content

Instantly share code, notes, and snippets.

@nsapa
Last active August 27, 2020 22:34
Show Gist options
  • Save nsapa/078e12acf5648fde11efbe8fd707e2ea to your computer and use it in GitHub Desktop.
Patch for WikiTeam/wikiteam to grab https://wiki.dystify.com
diff --git a/dumpgenerator.py b/dumpgenerator.py
index 3193fe2..e09cd68 100755
--- a/dumpgenerator.py
+++ b/dumpgenerator.py
@@ -497,8 +497,10 @@ def getUserAgent():
""" Return a cool user-agent to hide Python user-agent """
useragents = [
# firefox
- 'Mozilla/5.0 (X11; Linux x86_64; rv:72.0) Gecko/20100101 Firefox/72.0',
- 'Mozilla/5.0 (X11; Linux x86_64; rv:68.0) Gecko/20100101 Firefox/68.0',
+ #'Mozilla/5.0 (X11; Linux x86_64; rv:72.0) Gecko/20100101 Firefox/72.0',
+ #'Mozilla/5.0 (X11; Linux x86_64; rv:68.0) Gecko/20100101 Firefox/68.0',
+ 'Mozilla/5.0 (Windows NT 10.0; rv:78.0) Gecko/20100101 Firefox/78.0',
+ 'Mozilla/5.0 (Windows NT 10.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36'
]
return useragents[0]
@@ -574,6 +576,9 @@ def getXMLPageCore(headers={}, params={}, config={}, session=None):
except requests.exceptions.ConnectionError as e:
print ' Connection error: %s'%(str(e[0]))
xml = ''
+ except requests.exceptions.ReadTimeout as e:
+ print ' Read timeout: %s'%(str(e[0]))
+ xml = ''
c += 1
return xml
@@ -1471,16 +1476,30 @@ def generateImageDump(config={}, other={}, images=[], start='', session=None):
print 'Filename is too long, truncating. Now it is:', filename2
filename3 = u'%s/%s' % (imagepath, filename2)
imagefile = open(filename3, 'wb')
- r = requests.get(url=url)
+
+ # HACK HACK for wiki.dystify.com
+ url = url.replace('http:','https:')
+
+ r = session.head(url=url, allow_redirects=False)
+ if r.is_redirect:
+ print 'Site is redirecting us to: ', r.url
+ url = r.url
+ print 'Final URL image', url
+ r = session.get(url=url, allow_redirects=False)
+ if re.search(r'Not Acceptable', r.content):
+ print 'Server refused to send us content'
+ sys.exit()
imagefile.write(r.content)
imagefile.close()
# saving description if any
try:
title = u'Image:%s' % (filename)
if config['xmlrevisions'] and config['api'] and config['api'].endswith("api.php"):
+ print 'Using action=query to export title: ', title
r = session.get(config['api'] + u"?action=query&export&exportnowrap&titles=%s" % title)
xmlfiledesc = r.text
else:
+ print 'Using getXMLFileDesc() to export title: ', title
xmlfiledesc = getXMLFileDesc(
config=config,
title=title,
@@ -1494,7 +1513,7 @@ def generateImageDump(config={}, other={}, images=[], start='', session=None):
f = open('%s/%s.desc' % (imagepath, filename2), 'w')
# <text xml:space="preserve" bytes="36">Banner featuring SG1, SGA, SGU teams</text>
- if not re.search(r'</mediawiki>', xmlfiledesc):
+ if not re.search(r'</page>', xmlfiledesc):
# failure when retrieving desc? then save it as empty .desc
xmlfiledesc = ''
f.write(xmlfiledesc.encode('utf-8'))
@@ -1976,7 +1995,9 @@ def checkAPI(api=None, session=None):
def checkIndex(index=None, cookies=None, session=None):
""" Checking index.php availability """
+ print 'Index URL: ', index
r = session.post(url=index, data={'title': 'Special:Version'}, timeout=30)
+ #print r.text
if r.status_code >= 400:
print("ERROR: The wiki returned status code HTTP {}".format(r.status_code))
return False
diff --git a/launcher.py b/launcher.py
index 708635a..e7b4562 100644
--- a/launcher.py
+++ b/launcher.py
@@ -76,15 +76,15 @@ def main():
started = True
break #stop searching, dot not explore subdirectories
- # time.sleep(60)
+ time.sleep(60)
# Uncomment what above and add --delay=60 in the dumpgenerator.py calls below for broken wiki farms
# such as editthis.info, wiki-site.com, wikkii (adjust the value as needed;
# typically they don't provide any crawl-delay value in their robots.txt).
if started and wikidir: #then resume
print 'Resuming download, using directory', wikidir
- subprocess.call(['python2', 'dumpgenerator.py', '--api={}'.format(wiki), '--xml', '--images', '--resume', '--path={}'.format(wikidir)], shell=False)
+ subprocess.call(['python2', 'dumpgenerator.py', '--api={}'.format(wiki), '--xml', '--images', '--resume', '--delay=10', '--retries=10', '--path={}'.format(wikidir)], shell=False)
else: #download from scratch
- subprocess.call(['python2', 'dumpgenerator.py', '--api={}'.format(wiki), '--xml', '--images'], shell=False)
+ subprocess.call(['python2', 'dumpgenerator.py', '--api={}'.format(wiki), '--xml', '--images', '--delay=10', '--retries=10'], shell=False)
started = True
#save wikidir now
for f in os.listdir('.'):
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment