Last active
November 16, 2023 01:42
-
-
Save mrchipset/7deacb2ef01b43ee356ca58c6e85fac1 to your computer and use it in GitHub Desktop.
Submit the 10 most recently modified sitemap URLs to Baidu
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import logging | |
import http.client | |
import json | |
import sys | |
from datetime import datetime | |
from xml.dom.minidom import parseString | |
import xml.dom.minidom | |
# Browser-like User-Agent header sent with every outgoing HTTP request.
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36'
# Host name of the site whose /sitemap.xml is downloaded.
URL = 'www.xtigerkin.com'
# API token appended to the submission request; replace the placeholder.
TOKEN = '<YOUR API TOKEN>'

# Logging: every INFO-or-higher record is written to a log file and to the
# default stream of StreamHandler, both using the same line format.
logger = logging.getLogger("submit_baiduspider")
logger.setLevel(logging.INFO)
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
file_handler = logging.FileHandler('/var/log/submit_baiduspider.log')
stream_handler = logging.StreamHandler()
for _handler in (file_handler, stream_handler):
    _handler.setLevel(logging.INFO)
    _handler.setFormatter(formatter)
    logger.addHandler(_handler)
del _handler  # keep the module namespace identical to the hand-written setup
def get_sitemap_xml(url):
    """Download ``/sitemap.xml`` from the given host over HTTPS.

    Args:
        url: Host name to connect to (for example ``'www.example.com'``),
            without scheme or path.

    Returns:
        The response body decoded as UTF-8 when the server answers with
        status 200, otherwise ``None``.
    """
    logger.info('Try to get sitemap.xml.')
    headers = {
        'User-Agent': USER_AGENT
    }
    conn = http.client.HTTPSConnection(url)
    try:
        conn.request("GET", "/sitemap.xml", body='', headers=headers)
        res = conn.getresponse()
        if res.status != 200:
            return None
        return res.read().decode('utf-8')
    finally:
        # Release the socket on every path: success, non-200 reply, or error.
        conn.close()
def _first_text(parent, tag):
    """Return the stripped text of the first *tag* element under *parent*, or None."""
    nodes = parent.getElementsByTagName(tag)
    if not nodes:
        return None
    child = nodes[0].firstChild
    # An empty element has no child node; a non-text child has no nodeValue.
    if child is None or child.nodeValue is None:
        return None
    return child.nodeValue.strip() or None


def _parse_lastmod(text):
    """Parse the date part of a sitemap <lastmod> value; return None if invalid."""
    # Only the leading date is used, so full W3C datetimes such as
    # '2023-11-16T01:42:00+08:00' are ranked by day and never mix
    # timezone-aware with naive values during sorting.
    day = text[:10]
    for fmt in ('%Y-%m-%d', '%Y-%m', '%Y'):
        try:
            return datetime.strptime(day, fmt)
        except ValueError:
            continue
    return None


def parse_xml(_xml, limit=10):
    """Extract the most recently modified page URLs from a sitemap document.

    Each ``<url>`` entry of the ``<urlset>`` root contributes its ``<loc>``
    text, ranked by the date in its ``<lastmod>`` text (newest first).
    Entries lacking a usable ``<loc>`` or ``<lastmod>`` are skipped because
    they cannot be ranked.

    Args:
        _xml: Sitemap XML as text (or bytes).
        limit: Maximum number of URLs to return; expected to be a positive
            integer. Defaults to 10.

    Returns:
        A list of at most ``limit`` URL strings, newest first, or ``None``
        when the document is not well-formed XML, its root element is not
        ``urlset``, or no entry could be used.
    """
    # Imported locally so this function does not depend on the file's
    # top-level import list.
    from xml.parsers.expat import ExpatError

    logger.info('Parse the xml text.')
    try:
        root = parseString(_xml).documentElement
    except ExpatError:
        logger.exception('sitemap.xml is not well-formed XML.')
        return None
    if root.tagName != 'urlset':
        return None

    entries = []
    skipped = 0
    for elem in root.getElementsByTagName('url'):
        url = _first_text(elem, 'loc')
        lastmod = _first_text(elem, 'lastmod')
        date = _parse_lastmod(lastmod) if lastmod else None
        if url is None or date is None:
            skipped += 1
            continue
        entries.append({'url': url, 'date': date})
    if skipped:
        logger.info('Skipped %d sitemap entries without a usable loc/lastmod.', skipped)
    if not entries:
        return None

    # Stable sort: entries sharing a date keep their sitemap order.
    entries.sort(key=lambda item: item['date'], reverse=True)
    return [item['url'] for item in entries[:limit]]
def submit_urls(urls):
    """POST the given URLs, one per line, to ``data.zz.baidu.com``.

    Args:
        urls: Iterable of URL strings to submit.

    Returns:
        ``True`` when the server answers with status 200, ``False``
        otherwise (including when there is nothing to submit).
    """
    if not urls:
        logger.info('No urls to submit.')
        return False

    # Built from the module-level URL so the submitted site always matches
    # the host the sitemap was fetched from.
    endpoint = '/urls?site=https://' + URL + '&token=' + TOKEN
    logger.info('Try to submit urls in sitemap.xml.')
    payload = '\n'.join(urls)
    logger.info('Submitting urls:\n%s', payload)
    headers = {
        'User-Agent': USER_AGENT,
        'Content-Type': 'text/plain'
    }
    conn = http.client.HTTPConnection('data.zz.baidu.com')
    try:
        # http.client encodes str bodies as ISO-8859-1, which raises for
        # URLs containing characters outside Latin-1; send UTF-8 bytes.
        conn.request("POST", endpoint, payload.encode('utf-8'), headers)
        res = conn.getresponse()
        status = res.status
        data = res.read().decode('utf-8')
    finally:
        # Release the socket on every path, including request errors.
        conn.close()

    if status == 200:
        logger.info('Submit urls in sitemap.xml successfully.')
        return True
    logger.error('Try to submit urls in sitemap.xml failed.')
    logger.error(data)
    return False
def main():
    """Fetch the sitemap, pick the URLs to submit, and submit them.

    Exits the process with status -1 when the sitemap cannot be fetched,
    cannot be parsed, or the submission is reported as failed.
    """
    sitemap_xml = get_sitemap_xml(URL)
    if sitemap_xml is None:
        logger.error('Get sitemap.xml error.')
        sys.exit(-1)
    urls = parse_xml(sitemap_xml)
    if urls is None:
        logger.error('Parse xml error.')
        sys.exit(-1)
    # submit_urls signals failure with an explicit False; compare by identity
    # so any other return value is still treated as success.
    if submit_urls(urls) is False:
        sys.exit(-1)


if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment