Skip to content

Instantly share code, notes, and snippets.

@tadaken3
Last active July 17, 2019 09:20
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save tadaken3/9b96fd718f57ed278d0a5f169f1c05c5 to your computer and use it in GitHub Desktop.
Save tadaken3/9b96fd718f57ed278d0a5f169f1c05c5 to your computer and use it in GitHub Desktop.
Wikipediaのページからリンク一覧を取得する方法 ref: https://qiita.com/tadaken3/items/e09ba2ede988bbacb303
# coding: utf-8
import re
from bs4 import BeautifulSoup
from urllib.request import urlopen
from urllib.parse import unquote
# Print every internal article link found in the body of one Japanese
# Wikipedia page, with percent-encoded titles decoded for readability.

# Target article; the path component is the percent-encoded page title.
url = "https://ja.wikipedia.org/wiki/%E3%83%86%E3%82%A4%E3%83%AB%E3%82%BA_%E3%82%AA%E3%83%96_%E3%82%A4%E3%83%8E%E3%82%BB%E3%83%B3%E3%82%B9"

# Parse inside the `with` so the HTTP response is closed as soon as the
# document has been read, instead of leaking the connection.
with urlopen(url) as html:
    bsObj = BeautifulSoup(html, 'html.parser')

# Accept only hrefs that start with /wiki/ and contain no ':' anywhere after
# that prefix, which filters out namespaced targets such as "/wiki/Foo:Bar".
pattern = re.compile(r"^(/wiki/)((?!:).)*$")

# find() returns None when the container is absent; fail with a clear message
# rather than an AttributeError on None.
body = bsObj.find('div', {'id': 'bodyContent'})
if body is None:
    raise SystemExit("div#bodyContent not found in " + url)

# The href=pattern filter only yields <a> tags that have a matching href, so
# no separate attribute-presence check is needed.
for link in body.find_all('a', href=pattern):
    print(unquote(link.attrs['href']))
# coding: utf-8
import re
from bs4 import BeautifulSoup
from urllib.request import urlopen
from urllib.parse import unquote
# Print every internal article link found in the body of one Japanese
# Wikipedia page, with percent-encoded titles decoded for readability.

# Target article; the path component is the percent-encoded page title.
url = "https://ja.wikipedia.org/wiki/%E3%83%86%E3%82%A4%E3%83%AB%E3%82%BA_%E3%82%AA%E3%83%96_%E3%82%A4%E3%83%8E%E3%82%BB%E3%83%B3%E3%82%B9"

# Parse inside the `with` so the HTTP response is closed as soon as the
# document has been read, instead of leaking the connection.
with urlopen(url) as html:
    bsObj = BeautifulSoup(html, 'html.parser')

# Accept only hrefs that start with /wiki/ and contain no ':' anywhere after
# that prefix, which filters out namespaced targets such as "/wiki/Foo:Bar".
pattern = re.compile(r"^(/wiki/)((?!:).)*$")

# find() returns None when the container is absent; fail with a clear message
# rather than an AttributeError on None.
body = bsObj.find('div', {'id': 'bodyContent'})
if body is None:
    raise SystemExit("div#bodyContent not found in " + url)

# The href=pattern filter only yields <a> tags that have a matching href, so
# no separate attribute-presence check is needed.
for link in body.find_all('a', href=pattern):
    print(unquote(link.attrs['href']))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment