<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd">
<html lang="en">
<head>
<!-- Syntax-highlighted export of a Ruby source file; the Generator, syntax,
     settings and colorscheme meta tags below record how it was produced. -->
<meta http-equiv="content-type" content="text/html; charset=UTF-8">
<title>~/projects/jd_crawler/my_runner.rb.html</title>
<meta name="Generator" content="Vim/7.4">
<meta name="plugin-version" content="vim7.4_v1">
<meta name="syntax" content="ruby">
<meta name="settings" content="number_lines,use_css,no_foldcolumn,expand_tabs,line_ids,prevent_copy=">
<meta name="colorscheme" content="github">
|
<style type="text/css">
/* Page defaults: monospace text on the listing background. */
pre { font-family: monospace; color: #000000; background-color: #f8f8ff; }
body { font-family: monospace; color: #000000; background-color: #f8f8ff; }
* { font-size: 1em; }

/* One class per syntax-highlight group used by the spans in the listing. */
.Function { color: #990000; font-weight: bold; }
.Statement { color: #000000; font-weight: bold; }
.PreProc { color: #a0a0a0; font-weight: bold; }
.Type { color: #445588; font-weight: bold; }
.String { color: #d81745; }
.Number { color: #1c9898; }
/* Line-number gutter. */
.LineNr { color: #bbbbbb; background-color: #ececec; padding-bottom: 1px; }
.Constant { color: #177f80; }
.Special { color: #159828; font-weight: bold; }
.Identifier { color: #0086b3; }
</style>
|
|
|
<script type="text/javascript">
/**
 * Scroll the listing to the line named by the URL fragment.
 *
 * Accepts either a line id ("#L12") or a bare number ("#12"); a fragment
 * without an 'L' is prefixed with one so it matches the "L<n>" element ids.
 * Does nothing when no element with that id exists. Always returns true.
 */
function JumpToLine()
{
  /* Fragment without the leading '#'. */
  var lineNum = window.location.hash.slice(1);

  if (lineNum.indexOf('L') === -1) {
    lineNum = 'L' + lineNum;
  }

  /* Kept local so that calling this function never creates a global. */
  var lineElem = document.getElementById(lineNum);
  if (lineElem) {
    lineElem.scrollIntoView(true);
  }
  return true;
}

/* Re-run the jump whenever the fragment changes, where the event is supported. */
if ('onhashchange' in window) {
  window.onhashchange = JumpToLine;
}
</script>
</head>
|
<body onload="JumpToLine();">
<!-- Ruby listing. Every line starts with a LineNr span whose id ("L<n>") is
     the anchor that JumpToLine() scrolls to. -->
<pre id="vimCodeElement">
<span id="L1" class="LineNr"> 1 </span><span class="PreProc">require</span> <span class="String">'</span><span class="String">mechanize</span><span class="String">'</span>
<span id="L2" class="LineNr"> 2 </span><span class="PreProc">require</span> <span class="String">'</span><span class="String">pry</span><span class="String">'</span>
<span id="L3" class="LineNr"> 3 </span>
<span id="L4" class="LineNr"> 4 </span><span class="PreProc">module</span> <span class="Type">Runner</span>
<span id="L5" class="LineNr"> 5 </span>  <span class="Type">URL_PREFIX</span> = <span class="String">"</span><span class="String"><a href="http://d.360buy.com/area/get?fid=">http://d.360buy.com/area/get?fid=</a></span><span class="String">"</span>
<span id="L6" class="LineNr"> 6 </span>  <span class="Type">CHILDREN_NODE_CONTAINER</span> = []
<span id="L7" class="LineNr"> 7 </span>  <span class="Type">MAX_HEIGHT</span> = <span class="Number">3</span>
<span id="L8" class="LineNr"> 8 </span>  <span class="Type">AGENT</span> = <span class="Type">Mechanize</span>.new
<span id="L9" class="LineNr"> 9 </span>
<span id="L10" class="LineNr">10 </span>  <span class="PreProc">def</span> <span class="Constant">self</span>.<span class="Function">get_page_until_succeed</span>(url)
<span id="L11" class="LineNr">11 </span>    <span class="Statement">begin</span>
<span id="L12" class="LineNr">12 </span>      puts <span class="String">"</span><span class="String">GET </span><span class="Special">#{</span>url.inspect<span class="Special">}</span><span class="String">"</span>
<span id="L13" class="LineNr">13 </span>      <span class="Statement">return</span> <span class="Type">AGENT</span>.get(url)
<span id="L14" class="LineNr">14 </span>    <span class="Statement">rescue</span> =&gt; e
<span id="L15" class="LineNr">15 </span>      puts e.inspect
<span id="L16" class="LineNr">16 </span>      puts <span class="String">"</span><span class="String">Request failed. Retrying...</span><span class="String">"</span>
<span id="L17" class="LineNr">17 </span>      get_page_until_succeed(url)
<span id="L18" class="LineNr">18 </span>    <span class="Statement">end</span>
<span id="L19" class="LineNr">19 </span>  <span class="PreProc">end</span>
<span id="L20" class="LineNr">20 </span>
<span id="L21" class="LineNr">21 </span>  <span class="PreProc">def</span> <span class="Constant">self</span>.<span class="Function">get_page_then_parse_until_succeed</span>(url)
<span id="L22" class="LineNr">22 </span>    page = get_page_until_succeed(url)
<span id="L23" class="LineNr">23 </span>
<span id="L24" class="LineNr">24 </span>    <span class="Statement">begin</span>
<span id="L25" class="LineNr">25 </span>      parse_result = <span class="Type">JSON</span>.parse(page.body)
<span id="L26" class="LineNr">26 </span>      puts <span class="String">"</span><span class="String">Parse result: </span><span class="Special">#{</span>parse_result.inspect<span class="Special">}</span><span class="String">"</span>
<span id="L27" class="LineNr">27 </span>      <span class="Statement">return</span> parse_result
<span id="L28" class="LineNr">28 </span>    <span class="Statement">rescue</span> =&gt; e
<span id="L29" class="LineNr">29 </span>      puts e.inspect
<span id="L30" class="LineNr">30 </span>      puts <span class="String">"</span><span class="String">Parse failed. Retrying...</span><span class="String">"</span>
<span id="L31" class="LineNr">31 </span>      get_page_then_parse_until_succeed(url)
<span id="L32" class="LineNr">32 </span>    <span class="Statement">end</span>
<span id="L33" class="LineNr">33 </span>  <span class="PreProc">end</span>
<span id="L34" class="LineNr">34 </span>
<span id="L35" class="LineNr">35 </span>  <span class="PreProc">def</span> <span class="Constant">self</span>.<span class="Function">find_and_save_all_children</span>(parent, parent_height)
<span id="L36" class="LineNr">36 </span>    puts <span class="String">"</span><span class="String">Parent: </span><span class="Special">#{</span>parent<span class="Special">}</span><span class="String">"</span>
<span id="L37" class="LineNr">37 </span>    parse_result = get_page_then_parse_until_succeed(<span class="String">"</span><span class="Special">#{</span><span class="Type">URL_PREFIX</span><span class="Special">}#{</span>parent[<span class="String">'</span><span class="String">id</span><span class="String">'</span>]<span class="Special">}</span><span class="String">"</span>)
<span id="L38" class="LineNr">38 </span>    <span class="Statement">if</span> parse_result.class == <span class="Type">Array</span> <span class="Statement">and</span> <span class="Statement">not</span> parse_result.empty?
<span id="L39" class="LineNr">39 </span>      puts <span class="String">"</span><span class="String">All children of parent found.</span><span class="String">"</span>
<span id="L40" class="LineNr">40 </span>      <span class="Type">CHILDREN_NODE_CONTAINER</span>.concat(parse_result.each { |<span class="Identifier">child</span>| child[<span class="String">"</span><span class="String">parent_id</span><span class="String">"</span>] = parent[<span class="String">"</span><span class="String">id</span><span class="String">"</span>] })
<span id="L41" class="LineNr">41 </span>
<span id="L42" class="LineNr">42 </span>      children_height = parent_height + <span class="Number">1</span>
<span id="L43" class="LineNr">43 </span>      <span class="Statement">if</span> (children_height &gt;= <span class="Number">0</span>) <span class="Statement">and</span> (children_height &lt; <span class="Type">MAX_HEIGHT</span>)
<span id="L44" class="LineNr">44 </span>        parse_result.each { |<span class="Identifier">child</span>| find_and_save_all_children(child, children_height) }
<span id="L45" class="LineNr">45 </span>      <span class="Statement">end</span>
<span id="L46" class="LineNr">46 </span>    <span class="Statement">end</span>
<span id="L47" class="LineNr">47 </span>  <span class="PreProc">end</span>
<span id="L48" class="LineNr">48 </span>
<span id="L49" class="LineNr">49 </span>  <span class="PreProc">def</span> <span class="Constant">self</span>.<span class="Function">run</span>
<span id="L50" class="LineNr">50 </span>    puts <span class="String">"</span><span class="String">---------START-----------</span><span class="String">"</span>
<span id="L51" class="LineNr">51 </span>    root_node_container = <span class="Type">JSON</span>.parse(<span class="Type">File</span>.read(<span class="String">"</span><span class="String">./root_nodes.json</span><span class="String">"</span>))
<span id="L52" class="LineNr">52 </span>    <span class="Statement">if</span> <span class="Type">MAX_HEIGHT</span> &gt; <span class="Number">0</span>
<span id="L53" class="LineNr">53 </span>      root_node_container.each { |<span class="Identifier">root_node</span>| find_and_save_all_children(root_node, <span class="Number">0</span>) }
<span id="L54" class="LineNr">54 </span>    <span class="Statement">end</span>
<span id="L55" class="LineNr">55 </span>
<span id="L56" class="LineNr">56 </span>    <span class="Type">File</span>.open(<span class="String">"</span><span class="String">./jd_areas.json</span><span class="String">"</span>, <span class="String">"</span><span class="String">w</span><span class="String">"</span>) <span class="Statement">do</span> |<span class="Identifier">file</span>|
<span id="L57" class="LineNr">57 </span>      file.write <span class="Type">JSON</span>.generate(<span class="Type">CHILDREN_NODE_CONTAINER</span>)
<span id="L58" class="LineNr">58 </span>    <span class="Statement">end</span>
<span id="L59" class="LineNr">59 </span>    puts <span class="String">"</span><span class="String">----------END-----------</span><span class="String">"</span>
<span id="L60" class="LineNr">60 </span>  <span class="PreProc">end</span>
<span id="L61" class="LineNr">61 </span><span class="PreProc">end</span>
<span id="L62" class="LineNr">62 </span>
<span id="L63" class="LineNr">63 </span><span class="Type">Runner</span>.run
</pre>
</body>
</html>
<!-- vim: set foldmethod=manual : -->
|
|