Created
February 25, 2019 12:13
-
-
Save salgo60/e91252178f2a14a9137b1ef72b1dbe22 to your computer and use it in GitHub Desktop.
Extrahera YouTube-film-id och personnamn för att ladda upp till Wikidata
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""Extract the yuotube id and the title from a youtube playlist""" | |
from bs4 import BeautifulSoup | |
import requests | |
import urllib | |
import urllib.parse as urlparse | |
# URL of the YouTube playlist page that is fetched and parsed.
youtubePlaylist = "https://www.youtube.com/playlist?list=PL072ED1B5CB4BBFCD"
def clean_title(title):
    """Reduce a video title to the text expected to be the person's name.

    Removes the boilerplate phrases "People at KI: ", "Professor of ",
    "Möt " and "Professor " wherever they occur, keeps only the text before
    the first comma, and trims surrounding whitespace.

    Args:
        title: Video title string.

    Returns:
        The cleaned title string.
    """
    cleaned_title = title
    # Order matters: "Professor of " has to go before the shorter
    # "Professor ", otherwise a stray "of ..." would be left behind.
    for phrase in ("People at KI: ", "Professor of ", "Möt ", "Professor "):
        cleaned_title = cleaned_title.replace(phrase, "")
    # partition() yields the whole string when there is no comma, so no
    # separate membership test is needed.
    return cleaned_title.partition(",")[0].strip()
def get_youtube_playlist_links(url):
    """Print the video id, raw title and cleaned title of each playlist link.

    Downloads the page at ``url``, scans every ``<a dir="ltr">`` anchor whose
    ``href`` starts with ``/watch?`` and prints one
    ``id | title | cleaned title`` line per video, followed by a line with
    the number of videos printed.

    Args:
        url: Address of the playlist page to fetch.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If the server does not respond within 30 seconds.
    """
    response = requests.get(url, timeout=30)
    # Fail loudly instead of parsing an error page as if it were the playlist.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')
    count = 0
    for link in soup.find_all("a", {"dir": "ltr"}):
        href = link.get('href')
        # Anchors without an href attribute yield None here.
        if not href or not href.startswith('/watch?'):
            continue
        video_ids = urlparse.parse_qs(urlparse.urlparse(href).query).get('v')
        if not video_ids:
            # A /watch? link without a "v" parameter carries no video id.
            continue
        # get_text() also works when the anchor wraps nested tags, where
        # link.string would be None.
        title = link.get_text().strip()
        print(video_ids[0], "|", title, "|", clean_title(title))
        count += 1
    print("Number items in youtube list: ", count)
if __name__ == "__main__":
    # Only scrape when executed as a script, so that importing this module
    # does not trigger a network request.
    get_youtube_playlist_links(youtubePlaylist)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Exempel på output som sedan blir länkad data i Wikidata (länk)
cJz_FDEfzqM | Möt Maria Ankarcrona, professor vid Karolinska Institutet | Maria Ankarcrona
7oKayj8CNFQ | Möt Lena Palmberg, professor vid Karolinska Institutet | Lena Palmberg
lCLsP2alFGM | Möt Lars Lund, professor vid Karolinska Institutet | Lars Lund
C2EogOlqUeo | Möt Elisabet Stener-Victorin | Elisabet Stener-Victorin
Cf8gkNKyfl4 | Möt Eling de Bruin, professor vid Karolinska Institutet | Eling de Bruin
zxQkFOo0bUY | Möt Christian Giske, professor vid Karolinska Institutet | Christian Giske
za0F-BRjWFo | Möt Gilad Silberberg, professor vid Karolinska Institutet | Gilad Silberberg
l3U5pKH4mfc | People at KI: Michaela Sundqvist | Michaela Sundqvist
D3XuO-f-u9Q | People at KI: Johanna Gripenberg | Johanna Gripenberg
zSzddx-wZCM | People at KI: Nailin Li | Nailin Li
s78jwQ1BXNk | People at KI: Emily Ip | Emily Ip
LCF-J4r71yM | Magnus Nilsson, Professor of Surgery | Magnus Nilsson
5GVjNhXXmJc | Richard Rosenquist Brandell, Professor of Clinical Genetics | Richard Rosenquist Brandell
LwhTZoaWHQE | Lennart Blomqvist, Professor of Medical Radiology specialising in Oncology | Lennart Blomqvist
X1ZUcCeYE0Y | Johan Ärnlöv, Professor of Family Medicine at the Department of Neurobiology | Johan Ärnlöv
hpNUS0LZa10 | Jan Zedenius, Professor of Surgery specialising in Endocrine Surgery | Jan Zedenius
u3avR0-razQ | Ingrid Kockum, Professor of Genetic Epidemiology | Ingrid Kockum
bGrzNpE_H4A | Helle Kieler, Professor of Pharmacoepidemiology | Helle Kieler
jHYoERr7ljI | Gunnar Schulte, Professor of Receptor Pharmacology | Gunnar Schulte
rV_ZY962-cg | Harri Alenius, Professor of Molecular Toxicology | Harri Alenius
PFOFOwc1-d0 | Angel Cedazo-Minguez, Professor of Molecular Neurogeriatrics | Angel Cedazo-Minguez