Skip to content

Instantly share code, notes, and snippets.

@parkjinwoo
Last active December 24, 2015 01:09
Show Gist options
  • Save parkjinwoo/6722045 to your computer and use it in GitHub Desktop.
Save parkjinwoo/6722045 to your computer and use it in GitHub Desktop.
Scraper
#!/usr/bin/env perl
# Fetch the Naver front page and print its "realrank" list to STDOUT,
# one entry per line, as "<rank>\t<title>".
use strict;
use warnings;
use LWP::Simple;

# Encode everything written to STDOUT as UTF-8.
binmode(STDOUT, ":utf8");

my $url = "http://www.naver.com";
my $content = get($url);
die "Couldn't get it!" unless defined $content;

# Isolate the body of the <ol> whose own opening tag carries id="realrank".
# [^>]* keeps the id lookup inside that opening tag, so an unrelated <ol>
# earlier in the page cannot be paired with the id of a later element.
# A single match is wanted here, hence no /g.
if ($content =~ m{<ol\b[^>]*?\sid="realrank"(.*?)</ol>}si) {
    my $list  = $1;
    my $found = 0;

    # Each entry looks like:
    #   <li value="RANK" ...><a href ... title="TITLE">...</a>...</li>
    while ($list =~ m{<li value="(\d+)".*?><a href.*?title="(.*?)">.*?</a>.*?</li>}gsi) {
        # Copy the captures right away; $1/$2 are clobbered by the next match.
        my ($rank, $title) = ($1, $2);
        print "$rank\t$title\n";
        $found++;
    }

    # Make a layout change visible instead of silently printing nothing.
    warn "The realrank list at $url contained no recognizable entries\n"
        unless $found;
}
else {
    warn "No <ol id=\"realrank\"> list found at $url\n";
}
#!/usr/bin/env perl
# Fetch the Daum Movie "released" listing and print the link text of every
# movie-detail link found in it, one per line.
use strict;
use warnings;
use LWP::Simple;

# Encode everything written to STDOUT as UTF-8.
binmode(STDOUT, ":utf8");

my $url = "http://movie.daum.net/movieinfo/now/movieinfoReleased.do?modeType=all&order=recently";
my $content = get($url);
die "Couldn't get it!" unless defined $content;

# Matches <strong><a href="...moviedetailMain.do?movieId=N" title="...">TEXT</a></strong>
# and captures TEXT. The dots in the host name and in ".do" are escaped so
# they only match a literal '.', not any character.
my $movie_link = qr{
    <strong><a\s
    href="http://movie\.daum\.net/moviedetail/moviedetailMain\.do\?movieId=\d{1,6}"\s
    title=".*?">
    (.*?)
    </a></strong>
}xsi;

my $found = 0;
while ($content =~ /$movie_link/g) {
    my $list = $1;
    print $list . "\n";
    $found++;
}

# Make a layout change visible instead of silently printing nothing.
warn "No movie links matched at $url\n" unless $found;
#!/usr/bin/env python
"""Print the ``title`` attribute of each movie link in the Daum Movie
"released" listing, one per line."""
from lxml.cssselect import CSSSelector
import lxml.html
import httplib2

url = "http://movie.daum.net/movieinfo/now/movieinfoReleased.do?modeType=all&order=recently"

# ".cache" is handed to httplib2.Http as its cache argument.
h = httplib2.Http(".cache")
resp, content = h.request(url, "GET")

# Do not try to parse an error page as if it were the listing.
if resp.status != 200:
    raise SystemExit("GET %s failed with HTTP status %d" % (url, resp.status))

doc = lxml.html.fromstring(content)
sel = CSSSelector('#div_nowMovieTab_1 dl dt strong a')
for e in sel(doc):
    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print(e.get('title'))
#!/usr/bin/env ruby
# Prints the title attribute of each movie link in the Daum Movie
# "released" listing, one per line.
require 'rubygems'
require 'nokogiri'
require 'open-uri'

url = "http://movie.daum.net/movieinfo/now/movieinfoReleased.do?modeType=all&order=recently"

# URI.open (Ruby 2.5+) is used instead of Kernel#open: since Ruby 3.0
# open-uri no longer makes Kernel#open accept URLs.
page = Nokogiri::HTML(URI.open(url))
page.css("#div_nowMovieTab_1 dl dt strong a").each { |link| puts link["title"] }
#!/usr/bin/env ruby
# For each movie link in the Daum Movie "released" listing, fetches the
# linked detail page and prints three lines: the text of the title_kor
# element, the text of the eng element, and a rewritten poster image URL.
require 'rubygems'
require 'nokogiri'
require 'open-uri'

listing_url = "http://movie.daum.net/movieinfo/now/movieinfoReleased.do?modeType=all&order=recently"

# URI.open (Ruby 2.5+) is used instead of Kernel#open: since Ruby 3.0
# open-uri no longer makes Kernel#open accept URLs.
page = Nokogiri::HTML(URI.open(listing_url))
page.css("#div_nowMovieTab_1 dl dt strong a").each do |link|
  href = link["href"]
  # An anchor without an href has no detail page to fetch.
  next if href.nil? || href.empty?

  # Resolve against the listing URL so a relative href still works;
  # an absolute href is returned unchanged by URI.join.
  detail_url = URI.join(listing_url, href).to_s
  movie_info = Nokogiri::HTML(URI.open(detail_url))

  puts movie_info.css("#movieinfoDetail strong.title_kor").text
  puts movie_info.css("#movieinfoDetail span.eng").text
  # Replace the "C<w>x<h>" token in the poster URL with "R678x0"
  # (presumably selects a larger rendition -- confirm against the image CDN).
  puts movie_info.css("#movieinfoDetail p.poster img").attribute('src').to_s.gsub(/C[\d]{2,3}x[\d]{2,3}/, "R678x0")
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment