Skip to content

Instantly share code, notes, and snippets.

@messense
Created August 7, 2012 01:53
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save messense/3280630 to your computer and use it in GitHub Desktop.
Save messense/3280630 to your computer and use it in GitHub Desktop.
抓取邻居的耳朵网站的有声电台mp3文件地址 — scrapes the mp3 file URLs of audio-radio episodes from the "邻居的耳朵" ("Neighbour's Ear", kxt.fm) website.
#!/usr/bin/env python
# -*- coding:utf-8 -*-
import sys
import os
import base64
import urllib2
import re
import sqlite3
# Open (or create) the local SQLite database holding the scraped results;
# fetch_audio_links() inserts rows into the `fm` table through this cursor,
# and main() closes both when the crawl finishes.
db = sqlite3.connect("fm.db")
cursor = db.cursor()
def base64_decode(s):
    """Leniently base64-decode *s*.

    The site strips the trailing "=" padding from its base64 payloads, so
    a plain b64decode can fail.  Strategy:
      1. try the string as-is;
      2. on failure, re-pad to a multiple of 4 with "=" and retry;
      3. as a last resort, truncate to the largest strictly shorter
         multiple-of-4 prefix and decode that (loses trailing bytes).

    Returns the decoded data; an exception from step 3 propagates.
    """
    lens = len(s)
    # Largest multiple of 4 strictly shorter than the input (drops a whole
    # 4-char group when the length is already a multiple of 4).
    lenx = lens - (lens % 4 if lens % 4 else 4)
    # Python 2 raises TypeError on bad input; Python 3 raises
    # binascii.Error, which is a ValueError subclass — catch both.
    try:
        return base64.b64decode(s)
    except (TypeError, ValueError):
        try:
            return base64.b64decode(s + "=" * (-lens % 4))
        except (TypeError, ValueError):
            return base64.b64decode(s[:lenx])
def find_page_links(html):
    """Return every post permalink (http://kxt.fm/?p=NNN) in a tag-listing
    page, identified by its class="readmore" anchor."""
    # Backslashes are doubled so the regex metacharacters survive the
    # string literal; the original relied on invalid "\/"-style escapes,
    # which modern Python flags with a SyntaxWarning.
    pattern = re.compile(
        "(?imu)href=\"(http://kxt\\.fm/\\?p=\\d+)\" title=\"[^'\"]*\" class=\"readmore\""
    )
    return pattern.findall(html)
def fetch_audio_links(pages):
    """Download each post page in *pages*, extract the episode title and
    the base64-obfuscated mp3 URL, and insert them into the `fm` table.

    Uses the module-level `db`/`cursor`; commits after every row so a
    crash mid-run keeps the rows already harvested.
    """
    title_pattern = re.compile("(?imu)<h1 class=\"title\">([^'\"]+)</h1>")
    # The embedded player carries the mp3 location base64-encoded
    # in its soundFile:"..." parameter.
    url_pattern = re.compile("(?imu)soundFile:\"([^'\"]+)\"")
    for page in pages:
        try:
            print("Fetching page %s" % page)
            html = urllib2.urlopen(page).read()
        except Exception:  # network/HTTP failure: skip this page, keep going
            print("Fetch page %s failed" % page)
            continue
        m_title = title_pattern.search(html)
        m_url = url_pattern.search(html)
        title = ""
        url = ""
        if m_title:
            title = m_title.group(1).decode("utf-8")
        if m_url:
            url = base64_decode(m_url.group(1))
        # Parameterized query: scraped titles may contain quotes, which
        # would break (or inject into) a %-formatted SQL string.
        print("SQL: INSERT INTO fm(title,url) VALUES(%r,%r)" % (title, url))
        cursor.execute("INSERT INTO fm(title,url) VALUES(?,?)", (title, url))
        db.commit()
        print("Added url: %s" % url)
def main():
    """Crawl the "有声电台" (audio radio) tag listing on kxt.fm page by
    page and store every episode's title/mp3 URL via fetch_audio_links()."""
    # Tag listing page; the query string is the URL-encoded tag name.
    url = "http://kxt.fm/?tag=%E6%9C%89%E5%A3%B0%E7%94%B5%E5%8F%B0"
    try:
        html = urllib2.urlopen(url).read()
    except Exception:
        print("Open url: %s failed" % url)
        sys.exit(1)  # non-zero: the crawl never started
    # The pagination footer's "last" link carries the total page count.
    # Guard the search: .group(1) on a failed match raises AttributeError.
    m = re.search("(?imu)paged=(\\d+)\" class=\"last\"", html)
    if not m:
        print("Could not find page count on %s" % url)
        sys.exit(1)
    page_count = int(m.group(1))
    print("Page count is %s" % page_count)
    # Walk every listing page and collect the per-post links.
    for index in range(1, page_count + 1):
        print("Start to fetch page: %s&paged=%s" % (url, index))
        pages = []
        if index == 1:
            # Page 1 was already downloaded for the page-count probe.
            pages = find_page_links(html)
        else:
            page_url = "%s&paged=%s" % (url, index)
            try:
                html = urllib2.urlopen(page_url).read()
                pages = find_page_links(html)
            except Exception:
                print("Fetch page %s failed" % page_url)
        if pages:
            fetch_audio_links(pages)
    cursor.close()
    db.close()


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment