Skip to content

Instantly share code, notes, and snippets.

@nsapa
Last active August 27, 2020 22:34
Show Gist options
  • Save nsapa/078e12acf5648fde11efbe8fd707e2ea to your computer and use it in GitHub Desktop.
Patch for WikiTeam/wikiteam to grab https://wiki.dystify.com
diff --git a/dumpgenerator.py b/dumpgenerator.py
index 3193fe2..e09cd68 100755
--- a/dumpgenerator.py
+++ b/dumpgenerator.py
@@ -497,8 +497,10 @@ def getUserAgent():
""" Return a cool user-agent to hide Python user-agent """
useragents = [
# firefox
- 'Mozilla/5.0 (X11; Linux x86_64; rv:72.0) Gecko/20100101 Firefox/72.0',
- 'Mozilla/5.0 (X11; Linux x86_64; rv:68.0) Gecko/20100101 Firefox/68.0',
+ #'Mozilla/5.0 (X11; Linux x86_64; rv:72.0) Gecko/20100101 Firefox/72.0',
+ #'Mozilla/5.0 (X11; Linux x86_64; rv:68.0) Gecko/20100101 Firefox/68.0',
+ 'Mozilla/5.0 (Windows NT 10.0; rv:78.0) Gecko/20100101 Firefox/78.0',
+ 'Mozilla/5.0 (Windows NT 10.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36'
]
return useragents[0]
@@ -574,6 +576,9 @@ def getXMLPageCore(headers={}, params={}, config={}, session=None):
except requests.exceptions.ConnectionError as e:
print ' Connection error: %s'%(str(e[0]))
xml = ''
+ except requests.exceptions.ReadTimeout as e:
+ print ' Read timeout: %s'%(str(e[0]))
+ xml = ''
c += 1
return xml
@@ -1471,16 +1476,30 @@ def generateImageDump(config={}, other={}, images=[], start='', session=None):
print 'Filename is too long, truncating. Now it is:', filename2
filename3 = u'%s/%s' % (imagepath, filename2)
imagefile = open(filename3, 'wb')
- r = requests.get(url=url)
+
+ # HACK HACK for wiki.dystify.com
+ url = url.replace('http:','https:')
+
+ r = session.head(url=url, allow_redirects=False)
+ if r.is_redirect:
+ print 'Site is redirecting us to: ', r.url
+ url = r.url
+ print 'Final URL image', url
+ r = session.get(url=url, allow_redirects=False)
+ if re.search(r'Not Acceptable', r.content):
+ print 'Server refused to send us content'
+ sys.exit()
imagefile.write(r.content)
imagefile.close()
# saving description if any
try:
title = u'Image:%s' % (filename)
if config['xmlrevisions'] and config['api'] and config['api'].endswith("api.php"):
+ print 'Using action=query to export title: ', title
r = session.get(config['api'] + u"?action=query&export&exportnowrap&titles=%s" % title)
xmlfiledesc = r.text
else:
+ print 'Using getXMLFileDesc() to export title: ', title
xmlfiledesc = getXMLFileDesc(
config=config,
title=title,
@@ -1494,7 +1513,7 @@ def generateImageDump(config={}, other={}, images=[], start='', session=None):
f = open('%s/%s.desc' % (imagepath, filename2), 'w')
# <text xml:space="preserve" bytes="36">Banner featuring SG1, SGA, SGU teams</text>
- if not re.search(r'</mediawiki>', xmlfiledesc):
+ if not re.search(r'</page>', xmlfiledesc):
# failure when retrieving desc? then save it as empty .desc
xmlfiledesc = ''
f.write(xmlfiledesc.encode('utf-8'))
@@ -1976,7 +1995,9 @@ def checkAPI(api=None, session=None):
def checkIndex(index=None, cookies=None, session=None):
""" Checking index.php availability """
+ print 'Index URL: ', index
r = session.post(url=index, data={'title': 'Special:Version'}, timeout=30)
+ #print r.text
if r.status_code >= 400:
print("ERROR: The wiki returned status code HTTP {}".format(r.status_code))
return False
diff --git a/launcher.py b/launcher.py
index 708635a..e7b4562 100644
--- a/launcher.py
+++ b/launcher.py
@@ -76,15 +76,15 @@ def main():
started = True
break #stop searching, dot not explore subdirectories
- # time.sleep(60)
+ time.sleep(60)
# Uncomment what above and add --delay=60 in the dumpgenerator.py calls below for broken wiki farms
# such as editthis.info, wiki-site.com, wikkii (adjust the value as needed;
# typically they don't provide any crawl-delay value in their robots.txt).
if started and wikidir: #then resume
print 'Resuming download, using directory', wikidir
- subprocess.call(['python2', 'dumpgenerator.py', '--api={}'.format(wiki), '--xml', '--images', '--resume', '--path={}'.format(wikidir)], shell=False)
+ subprocess.call(['python2', 'dumpgenerator.py', '--api={}'.format(wiki), '--xml', '--images', '--resume', '--delay=10', '--retries=10', '--path={}'.format(wikidir)], shell=False)
else: #download from scratch
- subprocess.call(['python2', 'dumpgenerator.py', '--api={}'.format(wiki), '--xml', '--images'], shell=False)
+ subprocess.call(['python2', 'dumpgenerator.py', '--api={}'.format(wiki), '--xml', '--images', '--delay=10', '--retries=10'], shell=False)
started = True
#save wikidir now
for f in os.listdir('.'):
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment