Last active
February 28, 2022 04:44
-
-
Save knktc/846950067e60a92612c1befbe4213a32 to your computer and use it in GitHub Desktop.
用于自动向百度提交url的脚本,将会从sitemap中解析url,然后向百度进行提交
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3
"""
Script that submits a sitemap to Baidu.
For detailed usage, see: https://knktc.com/2022/02/27/submit-baidu-sitemap-by-github-actions/
@author:knktc
@contact:me@knktc.com
@create:2022-02-12 22:49
"""
import time | |
import argparse | |
from urllib import request | |
from urllib.parse import urljoin | |
import xml.etree.ElementTree as ET | |
def chunker(seq, size):
    """Yield consecutive slices of ``seq`` holding at most ``size`` items.

    Args:
        seq: A sliceable sequence with a length (for example a list of URLs).
        size: Maximum number of items per slice; must be a positive integer.

    Returns:
        A generator producing the slices in order. The last slice may be
        shorter than ``size``; an empty ``seq`` produces no slices.

    Raises:
        ValueError: If ``size`` is smaller than 1.
    """
    # A negative step would make ``range`` empty and silently drop every
    # item, so reject it up front together with zero.
    if size < 1:
        raise ValueError(f'chunk size must be a positive integer, got {size!r}')
    return (seq[start:start + size] for start in range(0, len(seq), size))
class BaiduSubmitter:
    """Read the URLs listed in a sitemap and POST them to Baidu in batches."""

    # XML namespace of the sitemap protocol's <url>/<loc> elements.
    SITEMAP_NS = 'http://www.sitemaps.org/schemas/sitemap/0.9'

    def __init__(self, site: str, token: str, sitemap: str):
        """Pre-compute the two URLs used by :meth:`run`.

        Args:
            site: Site address, e.g. ``https://knktc.com``.
            token: Token placed in the submit URL's ``token`` query field.
            sitemap: URL path of the sitemap, resolved against ``site``.
        """
        self.submit_url = self.gen_submit_url(site, token)
        self.sitemap_url = self.gen_sitemap_url(site, sitemap)

    @staticmethod
    def gen_submit_url(site: str, token: str) -> str:
        """Return the endpoint URL that link batches are POSTed to."""
        return f'http://data.zz.baidu.com/urls?site={site}&token={token}'

    @staticmethod
    def gen_sitemap_url(site: str, sitemap: str) -> str:
        """Return the sitemap URL obtained by joining ``sitemap`` onto ``site``."""
        return urljoin(site, sitemap)

    @staticmethod
    def get_links_from_sitemap(sitemap_url, timeout: float = 30) -> list:
        """Download the sitemap and return the text of every ``<url>/<loc>``.

        Args:
            sitemap_url: URL to fetch the sitemap XML from.
            timeout: Seconds to wait on the connection before giving up.

        Returns:
            The ``<loc>`` values in document order, with surrounding
            whitespace removed. Empty ``<loc>`` elements are skipped.
        """
        with request.urlopen(sitemap_url, timeout=timeout) as resp:
            data = resp.read()
        root = ET.fromstring(data)
        locations = root.findall(
            './sm:url/sm:loc', {'sm': BaiduSubmitter.SITEMAP_NS})
        links = []
        for loc in locations:
            # ``text`` is None for an empty element and may carry the
            # indentation of a pretty-printed file; neither is a usable URL
            # line for the newline-joined request body built in ``submit``.
            text = (loc.text or '').strip()
            if text:
                links.append(text)
        return links

    @staticmethod
    def submit(submit_url: str, links: list, timeout: float = 30):
        """POST ``links`` (one per line, UTF-8) and return the decoded reply.

        Args:
            submit_url: Endpoint produced by :meth:`gen_submit_url`.
            links: URL strings to send in this request.
            timeout: Seconds to wait on the connection before giving up.

        Returns:
            The response body decoded to ``str``.
        """
        data = '\n'.join(links).encode('utf8')
        req = request.Request(submit_url, data=data)
        # Close the response deterministically instead of leaving the
        # socket to the garbage collector.
        with request.urlopen(req, timeout=timeout) as resp:
            return resp.read().decode()

    def run(self, chunk_size=20, sleep_time=0.1):
        """Fetch the sitemap and submit its links in batches.

        Args:
            chunk_size: Maximum number of links sent per request.
            sleep_time: Extra seconds to pause after each request; a falsy
                value disables this extra pause.
        """
        links = self.get_links_from_sitemap(self.sitemap_url)
        print(f'Get {len(links)} links from sitemap: [{self.sitemap_url}]')
        for chunk in chunker(links, chunk_size):
            resp = self.submit(self.submit_url, chunk)
            print(resp)
            if sleep_time:
                time.sleep(sleep_time)
            # NOTE(review): the original indentation of this fixed pause was
            # lost; it is kept per request (the slowest reading) - confirm
            # whether it was meant to run only once after the loop.
            time.sleep(1)
def _positive_int(value):
    """argparse ``type`` callable: parse ``value`` as an integer >= 1."""
    try:
        number = int(value)
    except ValueError:
        raise argparse.ArgumentTypeError(
            f'invalid int value: {value!r}') from None
    if number < 1:
        raise argparse.ArgumentTypeError(
            f'must be a positive integer, got {number}')
    return number


def get_args(argv=None):
    """Parse the command-line options.

    Args:
        argv: Argument strings to parse. ``None`` (the default) makes
            argparse read ``sys.argv[1:]``.

    Returns:
        An ``argparse.Namespace`` with ``site``, ``token``, ``sitemap`` and
        ``chunk_size`` attributes.

    Raises:
        SystemExit: Raised by argparse when a required option is missing or
            a value is invalid.
    """
    parser = argparse.ArgumentParser(description='Submit sitemap to Baidu')
    parser.add_argument('--site', '-s', dest='site', required=True,
                        help='your site, eg: https://knktc.com')
    parser.add_argument('--token', '-t', dest='token', required=True,
                        help='baidu ziyuan token, you may find your token in https://ziyuan.baidu.com/linksubmit')
    parser.add_argument('--sitemap', '-p', dest='sitemap', default='sitemap.xml',
                        help='url path to get sitemap.xml file, default: sitemap.xml')
    # Reject zero/negative sizes here so the mistake is reported as a usage
    # error instead of surfacing later while the links are being batched.
    parser.add_argument('--chunk', '-c', type=_positive_int, dest='chunk_size', default=100,
                        help='how many urls should be submitted each time')
    return parser.parse_args(argv)
def main():
    """Parse the command-line options and run one submission pass.

    The parsed ``site``, ``token`` and ``sitemap`` values configure a
    ``BaiduSubmitter``; ``chunk_size`` is forwarded to its ``run`` method.
    """
    args = get_args()
    submitter = BaiduSubmitter(args.site, args.token, args.sitemap)
    submitter.run(chunk_size=args.chunk_size)
# Run the submission only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment