Skip to content

Instantly share code, notes, and snippets.

@yaojenkuo
Created April 28, 2019 03:59
Show Gist options
  • Save yaojenkuo/624ed6ffa560a878cde101e57c1f61ee to your computer and use it in GitHub Desktop.
Save yaojenkuo/624ed6ffa560a878cde101e57c1f61ee to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#!pip install -U pyquery selenium"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from pyquery import PyQuery as pq\n",
"from selenium import webdriver\n",
"from random import randint\n",
"import time"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"{'movieRating': 9.1, 'movieReleaseDate': '24 April 2019 (Taiwan)', 'movieGenre': ['Action', 'Adventure', 'Fantasy'], 'moviePosterLink': 'https://m.media-amazon.com/images/M/MV5BMTc5MDE2ODcwNV5BMl5BanBnXkFtZTgwMzI2NzQ2NzM@._V1_UX182_CR0,0,182,268_AL_.jpg', 'movieCast': ['Robert Downey Jr.', 'Chris Evans', 'Mark Ruffalo', 'Chris Hemsworth', 'Scarlett Johansson', 'Jeremy Renner', 'Don Cheadle', 'Paul Rudd', 'Benedict Cumberbatch', 'Chadwick Boseman', 'Brie Larson', 'Tom Holland', 'Karen Gillan', 'Zoe Saldana', 'Evangeline Lilly']}\n"
]
}
],
"source": [
"def get_movie_info(movie_url):\n",
"    \"\"\"\n",
"    Get movie info from certain IMDB url\n",
"\n",
"    Loads the page with PyQuery and reads the rating, genres, release date,\n",
"    poster link and cast names from it through CSS selectors.\n",
"\n",
"    :param movie_url: URL of an IMDB title page\n",
"    :return: dict with the keys movieRating, movieReleaseDate, movieGenre,\n",
"        moviePosterLink and movieCast; movieRating and movieReleaseDate\n",
"        are None when the page does not provide them\n",
"    \"\"\"\n",
"    d = pq(movie_url)\n",
"\n",
"    # No rating element means no parsable text; report None instead of raising\n",
"    rating_text = d(\"strong span\").text()\n",
"    try:\n",
"        movie_rating = float(rating_text)\n",
"    except (TypeError, ValueError):\n",
"        movie_rating = None\n",
"\n",
"    # The .subtext links hold the genres, with the release date as the last link\n",
"    movie_genre = [x.text() for x in d(\".subtext a\").items()]\n",
"    movie_release_date = movie_genre.pop() if movie_genre else None\n",
"    movie_poster = d(\".poster img\").attr('src')\n",
"    movie_cast = [x.text() for x in d(\".primary_photo+ td a\").items()]\n",
"\n",
"    # Assemble the result\n",
"    movie_info = {\n",
"        \"movieRating\": movie_rating,\n",
"        \"movieReleaseDate\": movie_release_date,\n",
"        \"movieGenre\": movie_genre,\n",
"        \"moviePosterLink\": movie_poster,\n",
"        \"movieCast\": movie_cast\n",
"    }\n",
"    return movie_info\n",
"\n",
"# Try the scraper on a single title page and show the resulting dict\n",
"end_game_url = \"https://www.imdb.com/title/tt4154796\"\n",
"end_game = get_movie_info(end_game_url)\n",
"print(end_game)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Check that Selenium can drive Chrome: open IMDB and print the URL it lands on\n",
"imdb_home = \"https://www.imdb.com/\"\n",
"driver = webdriver.Chrome(executable_path=\"YOURCHROMEDRIVERPATH\") # Use Chrome\n",
"try:\n",
"    driver.get(imdb_home)\n",
"    print(driver.current_url)\n",
"finally:\n",
"    # Shut the browser session down even when the page load fails\n",
"    driver.quit()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Check that Selenium can drive Firefox: open IMDB and print the URL it lands on\n",
"imdb_home = \"https://www.imdb.com/\"\n",
"driver = webdriver.Firefox(executable_path=\"YOURGECKODRIVERPATH\") # Use Firefox\n",
"try:\n",
"    driver.get(imdb_home)\n",
"    print(driver.current_url)\n",
"finally:\n",
"    # Shut the browser session down even when the page load fails\n",
"    driver.quit()"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"def get_movies(*args):\n",
"    \"\"\"\n",
"    Get multiple movies' info from movie titles\n",
"\n",
"    For every title, searches IMDB in a browser, narrows the results with the\n",
"    first sub-filter link, opens the first hit and scrapes that page with\n",
"    get_movie_info().\n",
"\n",
"    :param args: movie titles to look up\n",
"    :return: dict mapping each title to the dict returned by get_movie_info()\n",
"    \"\"\"\n",
"    imdb_home = \"https://www.imdb.com/\"\n",
"    driver = webdriver.Firefox(executable_path=\"PATHTOYOURGECKODRIVER\") # Use Firefox\n",
"    #driver = webdriver.Chrome(executable_path=\"PATHTOYOURCHROMEDRIVER\") # Use Chrome\n",
"    movies = dict()\n",
"    try:\n",
"        for movie_title in args:\n",
"            # Go to the IMDB home page\n",
"            driver.get(imdb_home)\n",
"            # Locate the search box\n",
"            search_elem = driver.find_element_by_xpath(\"//input[@id='navbar-query']\")\n",
"            # Type in the movie title\n",
"            search_elem.send_keys(movie_title)\n",
"            # Locate the search button\n",
"            submit_elem = driver.find_element_by_xpath(\"//div[@class='magnifyingglass navbarSprite']\")\n",
"            # Click the search button\n",
"            submit_elem.click()\n",
"            # Narrow the search results to the movie category\n",
"            category_movie_elem = driver.find_element_by_xpath(\"//ul[@class='findTitleSubfilterList']/li[1]/a\")\n",
"            # Click the category filter\n",
"            category_movie_elem.click()\n",
"            # Locate the link of the first search result\n",
"            first_result_elem = driver.find_element_by_xpath(\"//tr[@class='findResult odd'][1]/td[@class='result_text']/a\")\n",
"            # Click the search result link\n",
"            first_result_elem.click()\n",
"            # Scrape the page the browser landed on with get_movie_info()\n",
"            current_url = driver.current_url\n",
"            movie_info = get_movie_info(current_url)\n",
"            movies[movie_title] = movie_info\n",
"            # Random pause between lookups\n",
"            time.sleep(randint(3, 8))\n",
"    finally:\n",
"        # Release the browser even when one of the lookups raises\n",
"        driver.quit()\n",
"    return movies"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'Avengers: Endgame': {'movieRating': 9.1,\n",
" 'movieReleaseDate': '24 April 2019 (Taiwan)',\n",
" 'movieGenre': ['Action', 'Adventure', 'Fantasy'],\n",
" 'moviePosterLink': 'https://m.media-amazon.com/images/M/MV5BMTc5MDE2ODcwNV5BMl5BanBnXkFtZTgwMzI2NzQ2NzM@._V1_UX182_CR0,0,182,268_AL_.jpg',\n",
" 'movieCast': ['Robert Downey Jr.',\n",
" 'Chris Evans',\n",
" 'Mark Ruffalo',\n",
" 'Chris Hemsworth',\n",
" 'Scarlett Johansson',\n",
" 'Jeremy Renner',\n",
" 'Don Cheadle',\n",
" 'Paul Rudd',\n",
" 'Benedict Cumberbatch',\n",
" 'Chadwick Boseman',\n",
" 'Brie Larson',\n",
" 'Tom Holland',\n",
" 'Karen Gillan',\n",
" 'Zoe Saldana',\n",
" 'Evangeline Lilly']},\n",
" 'Captain Marvel': {'movieRating': 7.1,\n",
" 'movieReleaseDate': '6 March 2019 (Taiwan)',\n",
" 'movieGenre': ['Action', 'Adventure', 'Sci-Fi'],\n",
" 'moviePosterLink': 'https://m.media-amazon.com/images/M/MV5BMTE0YWFmOTMtYTU2ZS00ZTIxLWE3OTEtYTNiYzBkZjViZThiXkEyXkFqcGdeQXVyODMzMzQ4OTI@._V1_UX182_CR0,0,182,268_AL_.jpg',\n",
" 'movieCast': ['Brie Larson',\n",
" 'Samuel L. Jackson',\n",
" 'Ben Mendelsohn',\n",
" 'Jude Law',\n",
" 'Annette Bening',\n",
" 'Lashana Lynch',\n",
" 'Clark Gregg',\n",
" 'Rune Temte',\n",
" 'Gemma Chan',\n",
" 'Algenis Perez Soto',\n",
" 'Djimon Hounsou',\n",
" 'Lee Pace',\n",
" 'Chuku Modu',\n",
" 'Matthew Maher',\n",
" 'Akira Akbar']}}"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Look up both titles; the returned dict is the cell's last expression\n",
"movie_titles = (\"Avengers: Endgame\", \"Captain Marvel\")\n",
"get_movies(*movie_titles)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "selenium",
"language": "python",
"name": "selenium"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.2"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment