Skip to content

Instantly share code, notes, and snippets.

@knktc
Last active February 28, 2022 04:44
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save knktc/846950067e60a92612c1befbe4213a32 to your computer and use it in GitHub Desktop.
Save knktc/846950067e60a92612c1befbe4213a32 to your computer and use it in GitHub Desktop.
用于自动向百度提交url的脚本,将会从sitemap中解析url,然后向百度进行提交
#!/usr/bin/env python3
"""
向百度提交sitemap的脚本
详细使用方法请参考: https://knktc.com/2022/02/27/submit-baidu-sitemap-by-github-actions/
@author:knktc
@contact:me@knktc.com
@create:2022-02-12 22:49
"""
import time
import argparse
from urllib import request
from urllib.parse import urljoin
import xml.etree.ElementTree as ET
def chunker(seq, size):
""" iterate list by chunk """
return (seq[pos:pos + size] for pos in range(0, len(seq), size))
class BaiduSubmitter:
def __init__(self, site: str, token: str, sitemap: str):
self.submit_url = self.gen_submit_url(site, token)
self.sitemap_url = self.gen_sitemap_url(site, sitemap)
@staticmethod
def gen_submit_url(site: str, token: str) -> str:
""" generate url to submit to """
return f'http://data.zz.baidu.com/urls?site={site}&token={token}'
@staticmethod
def gen_sitemap_url(site: str, sitemap: str) -> str:
""" generate url path to get sitemap """
return urljoin(site, sitemap)
@staticmethod
def get_links_from_sitemap(sitemap_url) -> list:
""" download sitemap, parse and get urls """
with request.urlopen(sitemap_url) as resp:
data = resp.read()
root = ET.fromstring(data)
return [_.text for
_ in root.findall('./{http://www.sitemaps.org/schemas/sitemap/0.9}url/{http://www.sitemaps.org/schemas/sitemap/0.9}loc')]
@staticmethod
def submit(submit_url: str, links: list):
""" submit to baidu """
data = '\n'.join(links).encode('utf8')
req = request.Request(submit_url, data=data)
return request.urlopen(req).read().decode()
def run(self, chunk_size=20, sleep_time=0.1):
""" submit process """
links = self.get_links_from_sitemap(self.sitemap_url)
print(f'Get {len(links)} links from sitemap: [{self.sitemap_url}]')
for chunk in chunker(links, chunk_size):
resp = self.submit(self.submit_url, chunk)
print(resp)
if sleep_time:
time.sleep(sleep_time)
time.sleep(1)
def get_args():
""" get cli args """
parser = argparse.ArgumentParser(description='Submit sitemap to Baidu')
parser.add_argument('--site', '-s', type=str, dest='site', required=True,
help='your site, eg: https://knktc.com')
parser.add_argument('--token', '-t', type=str, dest='token', required=True,
help='baidu ziyuan token, you may find your token in https://ziyuan.baidu.com/linksubmit')
parser.add_argument('--sitemap', '-p', type=str, dest='sitemap', default='sitemap.xml',
help='url path to get sitemap.xml file, default: sitemap.xml')
parser.add_argument('--chunk', '-c', type=int, dest='chunk_size', default=100,
help='how many urls should be submitted each time')
args = parser.parse_args()
return args
def main():
"""
main process
"""
args = get_args()
site = args.site
token = args.token
sitemap_path = args.sitemap
chunk_size = args.chunk_size
submitter = BaiduSubmitter(site, token, sitemap_path)
submitter.run(chunk_size=chunk_size)
if __name__ == '__main__':
main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment