Skip to content

Instantly share code, notes, and snippets.

@mrchipset
Last active November 16, 2023 01:42
Show Gist options
  • Save mrchipset/7deacb2ef01b43ee356ca58c6e85fac1 to your computer and use it in GitHub Desktop.
Save mrchipset/7deacb2ef01b43ee356ca58c6e85fac1 to your computer and use it in GitHub Desktop.
Submit the 10 most recently modified URLs from a sitemap to Baidu
import logging
import http.client
import json
import sys
from datetime import datetime
from xml.dom.minidom import parseString
import xml.dom.minidom
# ---------------------------------------------------------------------------
# Module configuration
# ---------------------------------------------------------------------------

# Browser-like User-Agent sent with every HTTP request.
USER_AGENT = ('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
              '(KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36')
# Host whose sitemap is fetched (no scheme).
URL = 'www.xtigerkin.com'
# Baidu push-API token; fill in before running.
TOKEN = '<YOUR API TOKEN>'

# Logger that records to both a log file and the console at INFO level.
logger = logging.getLogger("submit_baiduspider")
logger.setLevel(logging.INFO)
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')

# File handler first, then console handler — both share the formatter.
for _handler in (logging.FileHandler('/var/log/submit_baiduspider.log'),
                 logging.StreamHandler()):
    _handler.setLevel(logging.INFO)
    _handler.setFormatter(formatter)
    logger.addHandler(_handler)
def get_sitemap_xml(url):
    """Fetch ``/sitemap.xml`` from *url* over HTTPS.

    Parameters
    ----------
    url : str
        Host name without scheme, e.g. ``'www.xtigerkin.com'``.

    Returns
    -------
    str or None
        The decoded sitemap XML text on HTTP 200, otherwise ``None``.
    """
    logger.info('Try to get sitemap.xml.')
    conn = http.client.HTTPSConnection(url)
    try:
        conn.request("GET", "/sitemap.xml", None, {'User-Agent': USER_AGENT})
        res = conn.getresponse()
        if res.status == 200:
            return res.read().decode('utf-8')
        # Fix: log the status on failure so a bad response is diagnosable
        # (the original returned None silently).
        logger.error('GET /sitemap.xml returned HTTP %s.', res.status)
        return None
    finally:
        conn.close()  # fix: the original never closed the connection
def parse_xml(_xml):
    """Extract the 10 most recently modified URLs from a sitemap document.

    Parameters
    ----------
    _xml : str
        Contents of a ``sitemap.xml`` file with a ``<urlset>`` root.

    Returns
    -------
    list[str] or None
        Up to 10 URLs sorted newest-first, or ``None`` when the document
        is not a ``urlset`` or contains no usable ``<url>`` entries.

    Notes
    -----
    ``nodeValue`` handling follows
    https://blog.csdn.net/u011478909/article/details/52103529
    """
    # Resolve the module logger by name so the function is self-contained;
    # getLogger returns the same singleton configured at import time.
    log = logging.getLogger("submit_baiduspider")
    log.info('Parse the xml text.')
    root = parseString(_xml).documentElement
    if root.tagName != 'urlset':
        return None
    entries = []
    for elem in root.getElementsByTagName('url'):
        loc = elem.getElementsByTagName('loc')
        lastmod = elem.getElementsByTagName('lastmod')
        # Fix: skip entries missing <loc>/<lastmod> (or with empty text)
        # instead of raising IndexError via unconditional [0] indexing.
        if not loc or not lastmod \
                or not loc[0].childNodes or not lastmod[0].childNodes:
            continue
        url = loc[0].childNodes[0].nodeValue
        date = lastmod[0].childNodes[0].nodeValue
        if url is None or date is None:
            continue
        # Sitemaps may carry either a plain date or a full W3C datetime
        # ('2023-11-16' or '2023-11-16T01:42:00+00:00'); keep only the
        # leading YYYY-MM-DD part so both forms parse.
        try:
            parsed = datetime.strptime(date[:10], '%Y-%m-%d')
        except ValueError:
            continue  # malformed date: skip the entry, don't abort the run
        entries.append({'url': url, 'date': parsed})
    if not entries:
        return None
    # Baidu accepts only ten records per day, so keep the 10 newest.
    entries.sort(key=lambda item: item['date'], reverse=True)
    return [item['url'] for item in entries[:10]]
def submit_urls(urls):
    """POST *urls* to Baidu's link-submission (push) API.

    Parameters
    ----------
    urls : list[str]
        URLs to submit, sent one per line in a ``text/plain`` body.

    Returns
    -------
    bool
        ``True`` when the API answered HTTP 200, otherwise ``False``.
    """
    _URL = '/urls?site=https://www.xtigerkin.com&token=' + TOKEN
    logger.info('Try to submit urls in sitemap.xml.')
    conn = http.client.HTTPConnection('data.zz.baidu.com')
    payload = '\n'.join(urls)
    headers = {
        'User-Agent': USER_AGENT,
        'Content-Type': 'text/plain'
    }
    # Fix: the leftover debug print(payload) is now a DEBUG log record.
    logger.debug('Submitting payload:\n%s', payload)
    try:
        conn.request("POST", _URL, payload, headers)
        res = conn.getresponse()
        data = res.read().decode('utf-8')
    finally:
        conn.close()  # fix: the original never closed the connection
    if res.status == 200:
        logger.info('Submit urls in sitemap.xml successfully.')
        return True  # fix: success previously fell through returning None
    # Fix: failures are errors, not routine information.
    logger.error('Try to submit urls in sitemap.xml failed.')
    logger.error(data)
    return False
def main():
    """Fetch the sitemap, pick the newest URLs and push them to Baidu.

    Exits the process with status -1 when the sitemap cannot be fetched
    or yields no usable URLs.
    """
    sitemap_xml = get_sitemap_xml(URL)
    if sitemap_xml is None:
        # Fix: failure paths now log at ERROR level instead of INFO.
        logger.error('Get sitemap.xml error.')
        sys.exit(-1)
    urls = parse_xml(sitemap_xml)
    if urls is None:
        logger.error('Parse xml error.')
        sys.exit(-1)
    submit_urls(urls)


if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment