Skip to content

Instantly share code, notes, and snippets.

@Kazuki-tam
Last active February 15, 2021 15:39
Show Gist options
  • Save Kazuki-tam/c997859b0e0bae2c1234198b29896482 to your computer and use it in GitHub Desktop.
Save Kazuki-tam/c997859b0e0bae2c1234198b29896482 to your computer and use it in GitHub Desktop.
Extract site url with Colaboratory
# ライブラリインポート
from google.colab import files
from bs4 import BeautifulSoup
import requests
import re
# Specify the sitemap.xml of the target site.
# The timeout keeps the cell from hanging indefinitely on an unresponsive
# server, and raise_for_status() stops on a 4xx/5xx response instead of
# silently parsing an error page into an empty URL list.
response = requests.get('https://example.com/sitemap.xml', timeout=30)
response.raise_for_status()
result = response.content

# Collect the text of every <loc> element, one URL per line.
bs = BeautifulSoup(result, 'html.parser')
loc_list = bs.select('loc')
# The substitution removes any literal single-lowercase-letter tag (e.g.
# "<a>") left in the text; strip() drops the whitespace/newlines that
# pretty-printed sitemaps place around the URL inside <loc>.
url_lists = ''.join(
    re.sub(r'<[a-z]>', '', loc.text).strip() + '\n'
    for loc in loc_list
)

# Print the list of site URLs.
print('#'*50)
print('▼サイトURL一覧')
print(url_lists)
print('#'*50)

# Write the list of site URLs to a text file and download it.
# The encoding is given explicitly because the header line is non-ASCII and
# the platform default encoding is not guaranteed to be able to represent it.
with open('sitemap.txt', 'w', encoding='utf-8') as f:
    f.write('▼サイトURL一覧\n')
    f.write('#'*50 + '\n')
    f.write(url_lists)
# Trigger the download only after the with-block has closed the file, so the
# full content has been flushed to disk.
files.download('sitemap.txt')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment