Skip to content

Instantly share code, notes, and snippets.

@thiagomarzagao
Last active February 11, 2016 23:11
Show Gist options
  • Save thiagomarzagao/1fa0c6776ab207a7a086 to your computer and use it in GitHub Desktop.
Save thiagomarzagao/1fa0c6776ab207a7a086 to your computer and use it in GitHub Desktop.
HTML that shows mangled code block
<div class="highlight"><pre><code class="language-python" data-lang="python"><table style="border-spacing: 0"><tbody><tr><td class="gutter gl" style="text-align: right"><pre class="lineno">1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88</pre></td><td class="code"><pre><span class="s">'''
scrape lyrics from vagalume.com.br
(author: thiagomarzagao.com)
'''</span>
<span class="kn">import</span> <span class="nn">json</span>
<span class="kn">import</span> <span class="nn">time</span>
<span class="kn">import</span> <span class="nn">pickle</span>
<span class="kn">import</span> <span class="nn">requests</span>
<span class="kn">from</span> <span class="nn">bs4</span> <span class="kn">import</span> <span class="n">BeautifulSoup</span>
<span class="c"># get each genre's URL</span>
<span class="n">basepath</span> <span class="o">=</span> <span class="s">'http://www.vagalume.com.br'</span>
<span class="n">r</span> <span class="o">=</span> <span class="n">requests</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="n">basepath</span> <span class="o">+</span> <span class="s">'/browse/style/'</span><span class="p">)</span>
<span class="n">soup</span> <span class="o">=</span> <span class="n">BeautifulSoup</span><span class="p">(</span><span class="n">r</span><span class="o">.</span><span class="n">text</span><span class="p">)</span>
<span class="n">genres</span> <span class="o">=</span> <span class="p">[</span><span class="s">u'Rock'</span><span class="p">,</span>
<span class="s">u'Ax</span><span class="se">\u00E9</span><span class="s">'</span><span class="p">,</span>
<span class="s">u'Forr</span><span class="se">\u00F3</span><span class="s">'</span><span class="p">,</span>
<span class="s">u'Pagode'</span><span class="p">,</span>
<span class="s">u'Samba'</span><span class="p">,</span>
<span class="s">u'Sertanejo'</span><span class="p">,</span>
<span class="s">u'MPB'</span><span class="p">,</span>
<span class="s">u'Rap'</span><span class="p">]</span>
<span class="n">genre_urls</span> <span class="o">=</span> <span class="p">{}</span>
<span class="k">for</span> <span class="n">genre</span> <span class="ow">in</span> <span class="n">genres</span><span class="p">:</span>
<span class="n">genre_urls</span><span class="p">[</span><span class="n">genre</span><span class="p">]</span> <span class="o">=</span> <span class="n">soup</span><span class="o">.</span><span class="n">find</span><span class="p">(</span><span class="s">'a'</span><span class="p">,</span> <span class="n">class_</span> <span class="o">=</span> <span class="s">'eA'</span><span class="p">,</span> <span class="n">text</span> <span class="o">=</span> <span class="n">genre</span><span class="p">)</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="s">'href'</span><span class="p">)</span>
<span class="c"># get each artist's URL, per genre</span>
<span class="n">artist_urls</span> <span class="o">=</span> <span class="p">{</span><span class="n">e</span><span class="p">:</span> <span class="p">[]</span> <span class="k">for</span> <span class="n">e</span> <span class="ow">in</span> <span class="n">genres</span><span class="p">}</span>
<span class="k">for</span> <span class="n">genre</span> <span class="ow">in</span> <span class="n">genres</span><span class="p">:</span>
<span class="n">r</span> <span class="o">=</span> <span class="n">requests</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="n">basepath</span> <span class="o">+</span> <span class="n">genre_urls</span><span class="p">[</span><span class="n">genre</span><span class="p">])</span>
<span class="n">soup</span> <span class="o">=</span> <span class="n">BeautifulSoup</span><span class="p">(</span><span class="n">r</span><span class="o">.</span><span class="n">text</span><span class="p">)</span>
<span class="n">counter</span> <span class="o">=</span> <span class="mi">0</span>
<span class="k">for</span> <span class="n">artist</span> <span class="ow">in</span> <span class="n">soup</span><span class="o">.</span><span class="n">find_all</span><span class="p">(</span><span class="s">'a'</span><span class="p">,</span> <span class="n">class_</span> <span class="o">=</span> <span class="s">'top'</span><span class="p">):</span>
<span class="n">counter</span> <span class="o">+=</span> <span class="mi">1</span>
<span class="k">print</span> <span class="s">'artist {} </span><span class="se">\r</span><span class="s">'</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="n">counter</span><span class="p">)</span>
<span class="n">artist_urls</span><span class="p">[</span><span class="n">genre</span><span class="p">]</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">basepath</span> <span class="o">+</span> <span class="n">artist</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="s">'href'</span><span class="p">))</span>
<span class="n">time</span><span class="o">.</span><span class="n">sleep</span><span class="p">(</span><span class="mi">2</span><span class="p">)</span> <span class="c"># don't reduce the 2-second wait (here or below) or you get errors</span>
<span class="c"># get each song's lyrics, per genre</span>
<span class="n">api</span> <span class="o">=</span> <span class="s">'http://api.vagalume.com.br/search.php?musid='</span>
<span class="n">genre_lyrics</span> <span class="o">=</span> <span class="p">{</span><span class="n">e</span><span class="p">:</span> <span class="p">{}</span> <span class="k">for</span> <span class="n">e</span> <span class="ow">in</span> <span class="n">genres</span><span class="p">}</span>
<span class="k">for</span> <span class="n">genre</span> <span class="ow">in</span> <span class="n">artist_urls</span><span class="p">:</span>
<span class="k">print</span> <span class="nb">len</span><span class="p">(</span><span class="n">artist_urls</span><span class="p">[</span><span class="n">genre</span><span class="p">])</span>
<span class="n">counter</span> <span class="o">=</span> <span class="mi">0</span>
<span class="n">artist1</span> <span class="o">=</span> <span class="bp">None</span>
<span class="k">for</span> <span class="n">url</span> <span class="ow">in</span> <span class="n">artist_urls</span><span class="p">[</span><span class="n">genre</span><span class="p">]:</span>
<span class="n">success</span> <span class="o">=</span> <span class="bp">False</span>
<span class="k">while</span> <span class="ow">not</span> <span class="n">success</span><span class="p">:</span> <span class="c"># retry loop in case your connection flickers</span>
<span class="k">try</span><span class="p">:</span>
<span class="n">r</span> <span class="o">=</span> <span class="n">requests</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="n">url</span><span class="p">)</span>
<span class="n">success</span> <span class="o">=</span> <span class="bp">True</span>
<span class="k">except</span><span class="p">:</span>
<span class="n">time</span><span class="o">.</span><span class="n">sleep</span><span class="p">(</span><span class="mi">2</span><span class="p">)</span>
<span class="n">soup</span> <span class="o">=</span> <span class="n">BeautifulSoup</span><span class="p">(</span><span class="n">r</span><span class="o">.</span><span class="n">text</span><span class="p">)</span>
<span class="n">hrefs</span> <span class="o">=</span> <span class="n">soup</span><span class="o">.</span><span class="n">find_all</span><span class="p">(</span><span class="s">'a'</span><span class="p">)</span>
<span class="k">for</span> <span class="n">href</span> <span class="ow">in</span> <span class="n">hrefs</span><span class="p">:</span>
<span class="k">if</span> <span class="n">href</span><span class="o">.</span><span class="n">has_attr</span><span class="p">(</span><span class="s">'data-song'</span><span class="p">):</span>
<span class="n">song_id</span> <span class="o">=</span> <span class="n">href</span><span class="p">[</span><span class="s">'data-song'</span><span class="p">]</span>
<span class="k">print</span> <span class="n">song_id</span>
<span class="n">time</span><span class="o">.</span><span class="n">sleep</span><span class="p">(</span><span class="mi">2</span><span class="p">)</span>
<span class="n">success</span> <span class="o">=</span> <span class="bp">False</span>
<span class="k">while</span> <span class="ow">not</span> <span class="n">success</span><span class="p">:</span>
<span class="k">try</span><span class="p">:</span>
<span class="n">song_metadata</span> <span class="o">=</span> <span class="n">requests</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="n">api</span> <span class="o">+</span> <span class="n">song_id</span><span class="p">)</span><span class="o">.</span><span class="n">json</span><span class="p">()</span>
<span class="n">success</span> <span class="o">=</span> <span class="bp">True</span>
<span class="k">except</span><span class="p">:</span>
<span class="n">time</span><span class="o">.</span><span class="n">sleep</span><span class="p">(</span><span class="mi">2</span><span class="p">)</span>
<span class="k">if</span> <span class="s">'mus'</span> <span class="ow">in</span> <span class="n">song_metadata</span><span class="p">:</span>
<span class="k">if</span> <span class="s">'lang'</span> <span class="ow">in</span> <span class="n">song_metadata</span><span class="p">[</span><span class="s">'mus'</span><span class="p">][</span><span class="mi">0</span><span class="p">]:</span> <span class="c"># discard if no language info</span>
<span class="n">language</span> <span class="o">=</span> <span class="n">song_metadata</span><span class="p">[</span><span class="s">'mus'</span><span class="p">][</span><span class="mi">0</span><span class="p">][</span><span class="s">'lang'</span><span class="p">]</span>
<span class="k">if</span> <span class="n">language</span> <span class="o">==</span> <span class="mi">1</span><span class="p">:</span> <span class="c"># discard if language != Portuguese</span>
<span class="k">if</span> <span class="s">'text'</span> <span class="ow">in</span> <span class="n">song_metadata</span><span class="p">[</span><span class="s">'mus'</span><span class="p">][</span><span class="mi">0</span><span class="p">]:</span> <span class="c"># discard if no lyrics</span>
<span class="n">artist2</span> <span class="o">=</span> <span class="n">song_metadata</span><span class="p">[</span><span class="s">'art'</span><span class="p">][</span><span class="s">'name'</span><span class="p">]</span>
<span class="k">if</span> <span class="n">artist2</span> <span class="o">!=</span> <span class="n">artist1</span><span class="p">:</span>
<span class="k">if</span> <span class="n">counter</span> <span class="o">&gt;</span> <span class="mi">0</span><span class="p">:</span>
<span class="k">print</span> <span class="n">artist1</span><span class="o">.</span><span class="n">encode</span><span class="p">(</span><span class="s">'utf-8'</span><span class="p">)</span> <span class="c"># change as needed</span>
<span class="n">genre_lyrics</span><span class="p">[</span><span class="n">genre</span><span class="p">][</span><span class="n">artist1</span><span class="p">]</span> <span class="o">=</span> <span class="n">artist_lyrics</span>
<span class="n">artist1</span> <span class="o">=</span> <span class="n">artist2</span>
<span class="n">artist_lyrics</span> <span class="o">=</span> <span class="p">[]</span>
<span class="n">lyrics</span> <span class="o">=</span> <span class="n">song_metadata</span><span class="p">[</span><span class="s">'mus'</span><span class="p">][</span><span class="mi">0</span><span class="p">][</span><span class="s">'text'</span><span class="p">]</span>
<span class="n">artist_lyrics</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">lyrics</span><span class="p">)</span>
<span class="n">counter</span> <span class="o">+=</span> <span class="mi">1</span>
<span class="k">print</span> <span class="s">'lyrics {} </span><span class="se">\r</span><span class="s">'</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="n">counter</span><span class="p">)</span>
<span class="c"># serialize</span>
<span class="k">with</span> <span class="nb">open</span><span class="p">(</span><span class="n">genre</span> <span class="o">+</span> <span class="s">'.json'</span><span class="p">,</span> <span class="n">mode</span> <span class="o">=</span> <span class="s">'wb'</span><span class="p">)</span> <span class="k">as</span> <span class="n">fbuffer</span><span class="p">:</span>
<span class="n">json</span><span class="o">.</span><span class="n">dump</span><span class="p">(</span><span class="n">genre_lyrics</span><span class="p">[</span><span class="n">genre</span><span class="p">],</span> <span class="n">fbuffer</span><span class="p">)</span><span class="w">
</span></pre></td></tr></tbody></table></code></pre></div>
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment