@pascallijuan
Forked from ysc8620/python_.idea_.name
Created August 19, 2013 06:15
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="Encoding" useUTFGuessing="true" native2AsciiForPropertiesFiles="false" />
</project>
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ProjectRootManager" version="2" project-jdk-name="Python 2.7.3 (C:/Python27/python.exe)" project-jdk-type="Python SDK" />
</project>
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ProjectModuleManager">
<modules>
<module fileurl="file://$PROJECT_DIR$/.idea/python.iml" filepath="$PROJECT_DIR$/.idea/python.iml" />
</modules>
</component>
</project>
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="PyDocumentationSettings">
<option name="myDocStringFormat" value="Plain" />
</component>
</project>
<?xml version="1.0" encoding="UTF-8"?>
<module type="PYTHON_MODULE" version="4">
<component name="NewModuleRootManager">
<content url="file://$MODULE_DIR$" />
<orderEntry type="inheritedJdk" />
<orderEntry type="sourceFolder" forTests="false" />
</component>
</module>
<component name="DependencyValidationManager">
<state>
<option name="SKIP_IMPORT_STATEMENTS" value="false" />
</state>
</component>
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="TestRunnerService">
<option name="projectConfiguration" value="Unittests" />
<option name="PROJECT_TEST_RUNNER" value="Unittests" />
</component>
</project>
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="VcsDirectoryMappings">
<mapping directory="" vcs="" />
</component>
</project>
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="Encoding" useUTFGuessing="true" native2AsciiForPropertiesFiles="false" />
</project>
<component name="InspectionProjectProfileManager">
<settings>
<option name="PROJECT_PROFILE" />
<option name="USE_PROJECT_PROFILE" value="false" />
<version value="1.0" />
</settings>
</component>
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ProjectRootManager" version="2" project-jdk-name="Python 2.7.3 (C:/Python27/python.exe)" project-jdk-type="Python SDK" />
</project>
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ProjectModuleManager">
<modules>
<module fileurl="file://$PROJECT_DIR$/.idea/newspider.iml" filepath="$PROJECT_DIR$/.idea/newspider.iml" />
</modules>
</component>
</project>
<?xml version="1.0" encoding="UTF-8"?>
<module type="PYTHON_MODULE" version="4">
<component name="NewModuleRootManager">
<content url="file://$MODULE_DIR$" />
<orderEntry type="inheritedJdk" />
<orderEntry type="sourceFolder" forTests="false" />
</component>
</module>
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="PyDocumentationSettings">
<option name="myDocStringFormat" value="Plain" />
</component>
</project>
<component name="DependencyValidationManager">
<state>
<option name="SKIP_IMPORT_STATEMENTS" value="false" />
</state>
</component>
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="TestRunnerService">
<option name="projectConfiguration" value="Unittests" />
<option name="PROJECT_TEST_RUNNER" value="Unittests" />
</component>
</project>
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="VcsDirectoryMappings">
<mapping directory="" vcs="" />
</component>
</project>
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ChangeListManager">
<list default="true" id="7dc9904d-98fe-4bed-8b89-bf5f25178704" name="Default" comment="" />
<ignored path="newspider.iws" />
<ignored path=".idea/workspace.xml" />
<option name="TRACKING_ENABLED" value="true" />
<option name="SHOW_DIALOG" value="false" />
<option name="HIGHLIGHT_CONFLICTS" value="true" />
<option name="HIGHLIGHT_NON_ACTIVE_CHANGELIST" value="false" />
<option name="LAST_RESOLUTION" value="IGNORE" />
</component>
<component name="ChangesViewManager" flattened_view="true" show_ignored="false" />
<component name="CreatePatchCommitExecutor">
<option name="PATCH_PATH" value="" />
</component>
<component name="DaemonCodeAnalyzer">
<disable_hints />
</component>
<component name="ExecutionTargetManager" SELECTED_TARGET="default_target" />
<component name="ProjectFrameBounds">
<option name="x" value="-8" />
<option name="y" value="-8" />
<option name="width" value="1936" />
<option name="height" value="1066" />
</component>
<component name="ProjectLevelVcsManager" settingsEditedManually="false">
<OptionsSetting value="true" id="Add" />
<OptionsSetting value="true" id="Remove" />
<OptionsSetting value="true" id="Checkout" />
<OptionsSetting value="true" id="Update" />
<OptionsSetting value="true" id="Status" />
<OptionsSetting value="true" id="Edit" />
<ConfirmationsSetting value="0" id="Add" />
<ConfirmationsSetting value="0" id="Remove" />
</component>
<component name="ProjectReloadState">
<option name="STATE" value="0" />
</component>
<component name="ProjectView">
<navigator currentView="ProjectPane" proportions="" version="1" splitterProportion="0.5">
<flattenPackages />
<showMembers />
<showModules />
<showLibraryContents />
<hideEmptyPackages />
<abbreviatePackageNames />
<autoscrollToSource />
<autoscrollFromSource />
<sortByType />
</navigator>
<panes>
<pane id="ProjectPane">
<subPane>
<PATH>
<PATH_ELEMENT>
<option name="myItemId" value="newspider" />
<option name="myItemType" value="com.intellij.ide.projectView.impl.nodes.ProjectViewProjectNode" />
</PATH_ELEMENT>
</PATH>
</subPane>
</pane>
<pane id="Scope" />
</panes>
</component>
<component name="PropertiesComponent">
<property name="WebServerToolWindowFactoryState" value="false" />
<property name="last_opened_file_path" value="$PROJECT_DIR$/.." />
</component>
<component name="RunManager">
<list size="0" />
</component>
<component name="ShelveChangesManager" show_recycled="false" />
<component name="TaskManager">
<task active="true" id="Default" summary="Default task">
<created>1366584457012</created>
<updated>1366584457012</updated>
</task>
<servers />
</component>
<component name="ToolWindowManager">
<frame x="-8" y="-8" width="1936" height="1066" extended-state="0" />
<editor active="false" />
<layout>
<window_info id="Changes" active="false" anchor="bottom" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" weight="0.33" sideWeight="0.5" order="-1" side_tool="false" content_ui="tabs" />
<window_info id="TODO" active="false" anchor="bottom" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" weight="0.33" sideWeight="0.5" order="6" side_tool="false" content_ui="tabs" />
<window_info id="Database" active="false" anchor="right" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" weight="0.33" sideWeight="0.5" order="-1" side_tool="false" content_ui="tabs" />
<window_info id="Structure" active="false" anchor="left" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" weight="0.25" sideWeight="0.5" order="1" side_tool="true" content_ui="tabs" />
<window_info id="Project" active="true" anchor="left" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="true" weight="0.2497332" sideWeight="0.5" order="0" side_tool="false" content_ui="combo" />
<window_info id="Debug" active="false" anchor="bottom" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" weight="0.4" sideWeight="0.5" order="3" side_tool="false" content_ui="tabs" />
<window_info id="Favorites" active="false" anchor="left" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" weight="0.33" sideWeight="0.5" order="-1" side_tool="true" content_ui="tabs" />
<window_info id="Event Log" active="false" anchor="bottom" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" weight="0.33" sideWeight="0.5" order="-1" side_tool="true" content_ui="tabs" />
<window_info id="Run" active="false" anchor="bottom" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" weight="0.33" sideWeight="0.5" order="2" side_tool="false" content_ui="tabs" />
<window_info id="Version Control" active="false" anchor="bottom" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" weight="0.33" sideWeight="0.5" order="-1" side_tool="false" content_ui="tabs" />
<window_info id="Cvs" active="false" anchor="bottom" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" weight="0.25" sideWeight="0.5" order="4" side_tool="false" content_ui="tabs" />
<window_info id="Message" active="false" anchor="bottom" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" weight="0.33" sideWeight="0.5" order="0" side_tool="false" content_ui="tabs" />
<window_info id="Find" active="false" anchor="bottom" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" weight="0.33" sideWeight="0.5" order="1" side_tool="false" content_ui="tabs" />
<window_info id="Ant Build" active="false" anchor="right" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" weight="0.25" sideWeight="0.5" order="1" side_tool="false" content_ui="tabs" />
<window_info id="Commander" active="false" anchor="right" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" weight="0.4" sideWeight="0.5" order="0" side_tool="false" content_ui="tabs" />
<window_info id="Inspection" active="false" anchor="bottom" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" weight="0.4" sideWeight="0.5" order="5" side_tool="false" content_ui="tabs" />
<window_info id="Hierarchy" active="false" anchor="right" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" weight="0.25" sideWeight="0.5" order="2" side_tool="false" content_ui="combo" />
</layout>
</component>
<component name="VcsContentAnnotationSettings">
<option name="myLimit" value="2678400000" />
</component>
<component name="VcsManagerConfiguration">
<option name="OFFER_MOVE_TO_ANOTHER_CHANGELIST_ON_PARTIAL_COMMIT" value="true" />
<option name="CHECK_CODE_SMELLS_BEFORE_PROJECT_COMMIT" value="false" />
<option name="CHECK_NEW_TODO" value="true" />
<option name="myTodoPanelSettings">
<value>
<are-packages-shown value="false" />
<are-modules-shown value="false" />
<flatten-packages value="false" />
<is-autoscroll-to-source value="false" />
</value>
</option>
<option name="PERFORM_UPDATE_IN_BACKGROUND" value="true" />
<option name="PERFORM_COMMIT_IN_BACKGROUND" value="true" />
<option name="PERFORM_EDIT_IN_BACKGROUND" value="true" />
<option name="PERFORM_CHECKOUT_IN_BACKGROUND" value="true" />
<option name="PERFORM_ADD_REMOVE_IN_BACKGROUND" value="true" />
<option name="PERFORM_ROLLBACK_IN_BACKGROUND" value="false" />
<option name="CHECK_LOCALLY_CHANGED_CONFLICTS_IN_BACKGROUND" value="false" />
<option name="CHANGED_ON_SERVER_INTERVAL" value="60" />
<option name="SHOW_ONLY_CHANGED_IN_SELECTION_DIFF" value="true" />
<option name="CHECK_COMMIT_MESSAGE_SPELLING" value="true" />
<option name="DEFAULT_PATCH_EXTENSION" value="patch" />
<option name="SHORT_DIFF_HORISONTALLY" value="true" />
<option name="SHORT_DIFF_EXTRA_LINES" value="2" />
<option name="SOFT_WRAPS_IN_SHORT_DIFF" value="true" />
<option name="INCLUDE_TEXT_INTO_PATCH" value="false" />
<option name="INCLUDE_TEXT_INTO_SHELF" value="false" />
<option name="SHOW_FILE_HISTORY_DETAILS" value="true" />
<option name="SHOW_VCS_ERROR_NOTIFICATIONS" value="true" />
<option name="FORCE_NON_EMPTY_COMMENT" value="false" />
<option name="CLEAR_INITIAL_COMMIT_MESSAGE" value="false" />
<option name="LAST_COMMIT_MESSAGE" />
<option name="MAKE_NEW_CHANGELIST_ACTIVE" value="false" />
<option name="OPTIMIZE_IMPORTS_BEFORE_PROJECT_COMMIT" value="false" />
<option name="CHECK_FILES_UP_TO_DATE_BEFORE_COMMIT" value="false" />
<option name="REFORMAT_BEFORE_PROJECT_COMMIT" value="false" />
<option name="REFORMAT_BEFORE_FILE_COMMIT" value="false" />
<option name="FILE_HISTORY_DIALOG_COMMENTS_SPLITTER_PROPORTION" value="0.8" />
<option name="FILE_HISTORY_DIALOG_SPLITTER_PROPORTION" value="0.5" />
<option name="ACTIVE_VCS_NAME" />
<option name="UPDATE_GROUP_BY_PACKAGES" value="false" />
<option name="UPDATE_GROUP_BY_CHANGELIST" value="false" />
<option name="SHOW_FILE_HISTORY_AS_TREE" value="false" />
<option name="FILE_HISTORY_SPLITTER_PROPORTION" value="0.6" />
</component>
<component name="XDebuggerManager">
<breakpoint-manager />
</component>
</project>
<?xml version="1.0" encoding="utf-8"?>
<root>
<site siteName="ffdy" readMode="normal" url="http://www.ffdy.cc/s" daily="0.3" charset="utf-8">
<linkRules>
<rule type="reg" value="(type/movie|movie)" />
</linkRules>
<targets>
<target name="info">
<urlRules>
<rule type="reg" value=".*/movie/(\d+).html" />
</urlRules>
<model dataType="array">
<field name="title">
<parsers>
<parser type="text" xpath="//h1/text()" />
</parsers>
</field>
<field name="url">
<parsers>
<parser type="pageurl" xpath="//h1/text()" />
</parsers>
</field>
<field name="detail_pic">
<parsers>
<parser type="text" xpath="//div[@class='detail_pic']/span/img/@src" />
</parsers>
</field>
<field name="director">
<parsers>
<parser type="array" xpath="//div[@class='detail_intro']/table/tr/td[text()='导演:']/../td[last()]/a/text()" code="u" />
</parsers>
</field>
<field name="leading">
<parsers>
<parser type="array" xpath="//div[@class='detail_intro']/table/tr/td[text()='主演:']/../td[last()]/a/text()" code="u" />
</parsers>
</field>
<field name="type">
<parsers>
<parser type="array" xpath="//div[@class='detail_intro']/table/tr/td[text()='类型:']/../td[last()]/a/text()" code="u" />
</parsers>
</field>
<field name="area">
<parsers>
<parser type="text" xpath="//div[@class='detail_intro']/table/tr/td[text()='国家/地区:']/../td[last()]/a/text()" code="u" />
</parsers>
</field>
<field name="show_day">
<parsers>
<parser type="text" xpath="//div[@class='detail_intro']/table/tr/td[text()='上映日期:']/../td[last()]/text()" code="u" />
</parsers>
</field>
<field name="comment">
<parsers>
<parser type="html" xpath="//div[@class='filmcontents']" reg="u" />
</parsers>
</field>
</model>
</target>
</targets>
</site>
</root>
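# -*- coding: utf-8 -*-
# Minimal sketch (not part of the spider) of how the linkRules/urlRules
# regexes in the config above are meant to work: linkRules decides which
# links get queued, urlRules decides whether a queued page is a detail page.
# The sample URLs are made up; the patterns are the ones from the XML.
# (The spider itself prepends the site URL and uses match(); search() here
# just shows the intent.)
import re

link_rule = re.compile(r'(type/movie|movie)')    # linkRules/rule
info_rule = re.compile(r'.*/movie/(\d+).html')   # urlRules/rule

for u in ('http://www.ffdy.cc/movie/10450.html', 'http://www.ffdy.cc/about.html'):
    if link_rule.search(u):
        print u, '-> queue it'
    if info_rule.match(u):
        print u, '-> detail page, run the field parsers'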
# -*- coding: utf-8 -*-
'''
Spider bootstrap entry point; spins up threads to drive the crawl.
'''
__author__ = 'ShengYue'
import threading
import time
from lxml import etree
import os
from model.log import *
from model.db import *
from model.curl import *

'''
Spider scheduling interface.
'''
class newsspider:
    def __init__(self, xpath_file):
        self.xpath_file = xpath_file
        logging.info(u'-----------------------------------------------------------------------------')
        logging.info(u'Creating newsspider object: ' + xpath_file)
        try:
            if os.path.exists(xpath_file):
                self.config_tree = etree.ElementTree(file=xpath_file)
                sites = self.config_tree.xpath('//site')
                if sites == []:
                    logging.error(u'Malformed site config file: ' + xpath_file)
                    exit(0)
                site = sites[0]
                # site URL
                self.site_url = site.get('url')
                if self.site_url is None:
                    logging.error(u'Could not read the site URL from the config file: ' + xpath_file)
                    exit(0)
                # site name
                self.site_name = site.get('siteName')
                self.daily = float(site.get('daily'))
                # link-reading mode for the site
                self.readMode = site.get('readMode')
                print self.readMode
                self.linkRule = self.config_tree.xpath('//linkRules/rule')
                self.infoUrlRule = self.config_tree.xpath('//urlRules/rule')
                self.infoRule = self.config_tree.xpath('//targets/target/model/field')
                self.linkdb = linkdb(self.site_name)
                self.run(self.site_url)
            else:
                logging.error(u'Config file does not exist: ' + xpath_file)
                exit(0)
        except Exception, e:
            logging.error(u'Failed to read config file: ' + xpath_file + ', --' + e.message)
            exit(0)

    def run(self, url):
        logging.info(u'Start executing config file: ' + self.xpath_file)
        if self.readMode == 'normal':
            self.autoRead(url)
        elif self.readMode == 'match':
            print u'match mode not found'
        else:
            logging.error(u'No read rule found in ' + self.xpath_file)
            exit(0)

    def autoRead(self, url):
        time.sleep(self.daily)
        try:
            # first call: start from the site root
            if url is not None:
                url = self.site_url
            else:
                urlData = self.linkdb.get_url(self.site_name)
                if urlData is None:
                    logging.info(self.site_url + u' fully read')
                    exit(0)
                url = urlData[1]
                # mark the link as crawled
                self.linkdb.update_url(urlData[0])
            html = curl().read(url)
            if html is None:
                logging.error(u'Failed to fetch HTML: ' + url)
            '''Link extraction goes here'''
        except Exception, e:
            logging.error(u'Execution failed: ' + self.xpath_file + ', --' + e.message)

    def close(self):
        logging.info(u'Done: ')
        self.linkdb.close()

'''
Create a thread instance.
'''
class timer(threading.Thread):  # the timer class is derived from threading.Thread
    def __init__(self, num, interval):
        threading.Thread.__init__(self)
        self.thread_num = num
        self.i = 0
        self.interval = interval
        self.thread_stop = False

    def run(self):  # overridden run() method: the thread's work loop
        while not self.thread_stop:
            self.i = self.i + 1
            print '%d Thread Object(%d), Time:%s\n' % (self.i, self.thread_num, time.ctime())
            time.sleep(self.interval)

    def stop(self):
        self.thread_stop = True

def test():
    thread1 = timer(1, 0.01)
    thread2 = timer(2, 0.2)
    thread1.start()
    thread2.start()
    newsspider('yousheng.xml')  # newsspider requires a config file path
    time.sleep(5)
    thread1.stop()
    thread2.stop()
    return

if __name__ == '__main__':
    #test()
    ns = newsspider('yousheng.xml')
    #ns.run()
    ns.close()
# -*- coding: utf-8 -*-
__author__ = 'ShengYue'
import urllib2
import time
import random
import os.path
import urllib
from log import *

class curl:
    # URL table
    urlList = {}
    req = None

    # character-encoding handling: try common encodings in turn
    def mdcode(self, data, url=''):
        # code = chardet.detect(data)
        # return data.decode(code['encoding'])
        for c in ('utf-8', 'gbk', 'gb2312'):
            try:
                return data.decode(c)
            except:
                pass
        # only report an error once every candidate encoding has failed
        logging.error(u'Encoding error: ' + url)

    def read(self, url, config={}):
        try:
            url = urllib.unquote(url)
            header = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; rv:19.0) Gecko/20100101 Firefox/19.0'}
            self.req = urllib2.Request(url, headers=header)
            # extra request headers
            for key in config:
                self.req.add_header(key, config[key])
            res = urllib2.urlopen(self.req)
            html = res.read()
            res.close()
            return self.mdcode(html, url)
        except Exception, e:
            logging.error(u'Failed to fetch HTML: ' + url + '--' + e.message)

    def getFileName(self):
        return time.strftime('%y%m%d%H%M', time.localtime(time.time())) + '-' + str(random.randint(10, 99)) + '-' + str(random.randint(10, 99))

    def down(self, url, path=''):
        ext = os.path.splitext(url)[-1]
        socket = urllib2.urlopen(url)
        data = socket.read()
        fileName = self.getFileName() + ext
        with open('./images/' + fileName, "wb") as jpg:
            jpg.write(data)
        socket.close()
        return '/uploads/images/' + fileName
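# Usage sketch for the curl class above (the URL is illustrative only, and
# down() assumes an ./images directory already exists next to the script).
if __name__ == '__main__':
    c = curl()
    page = c.read('http://www.ffdy.cc/movie/10450.html')
    if page is not None:
        print page[:100]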
# -*- coding: utf-8 -*-
__author__ = 'ShengYue'
import hashlib
import sqlite3
from log import *

'''
Link management.
'''
class linkdb:
    def __init__(self, dbname):
        try:
            self.conn = sqlite3.connect(':memory:')
            self.cur = self.conn.cursor()
            # create the in-memory links table
            self.cur.execute('''CREATE TABLE IF NOT EXISTS `links` (
                `id` INTEGER PRIMARY KEY AUTOINCREMENT,
                `link` varchar(300) NOT NULL,
                `web_name` varchar(16) NOT NULL,
                `md5` varchar(32) NOT NULL,
                `status` tinyint(1) NOT NULL DEFAULT '0'
            );''')
            self.cur.execute('''CREATE INDEX IF NOT EXISTS status on links (status);''')
            self.cur.execute('''CREATE INDEX IF NOT EXISTS md5 on links (md5);''')
            self.cur.execute('''CREATE INDEX IF NOT EXISTS web_name on links (web_name);''')
            #self.conn.commit()
        except Exception, e:
            logging.error(u'Database initialisation failed: ' + e.message)
            exit(0)

    def get_url(self, web_name):
        try:
            # sqlite3 uses ? placeholders and expects a parameter sequence
            self.cur.execute("SELECT * FROM links WHERE web_name = ? AND status = 0", (web_name,))
            return self.cur.fetchone()
        except Exception, e:
            logging.error(u'Failed to fetch link: ' + web_name + ':' + e.message)
            exit(0)

    def check_url(self, url):
        try:
            md5 = hashlib.md5(url).hexdigest()
            self.cur.execute("SELECT * FROM links WHERE `md5` = ?", (md5,))
            # return the number of matching rows, not the cursor itself
            return len(self.cur.fetchall())
        except Exception, e:
            logging.error(u'Link check failed: ' + url + ':' + e.message)

    def add_url(self, url, web_name):
        try:
            md5 = hashlib.md5(url).hexdigest()
            self.cur.execute("INSERT INTO links(`link`, `web_name`, `md5`) VALUES(?, ?, ?)", [url, web_name, md5])
            #self.conn.commit()
        except Exception, e:
            logging.error(u'Failed to add link: ' + url + ', ' + web_name + ':' + e.message)

    def update_url(self, id):
        try:
            self.cur.execute("UPDATE links SET status = 1 WHERE id = ?", (id,))
            #self.conn.commit()
            return True
        except Exception, e:
            logging.error(u'Failed to update link: ' + str(id) + ':' + e.message)

    def close(self):
        try:
            self.cur.close()
            self.conn.close()
        except Exception, e:
            logging.error(u'Failed to close database: ' + e.message)
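# Usage sketch for linkdb above. The table lives in :memory:, so the queue
# is lost whenever the connection closes; the site name and URL are illustrative.
if __name__ == '__main__':
    ldb = linkdb('ffdy')
    ldb.add_url('http://www.ffdy.cc/movie/10450.html', 'ffdy')
    if ldb.check_url('http://www.ffdy.cc/movie/10450.html') > 0:
        row = ldb.get_url('ffdy')      # -> (id, link, web_name, md5, status)
        ldb.update_url(row[0])         # mark the link as crawled
    ldb.close()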
# -*- coding: utf-8 -*-
__author__ = 'ShengYue'
import logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s %(filename)s[line:%(lineno)d] %(levelname)s %(message)s',datefmt='%Y-%m-%d %H:%M:%S', filename='system.log',filemode='a+')
# -*- coding: utf-8 -*-
__author__ = 'ShengYue'
import lxml
import lxml.etree
from lxml.html.clean import Cleaner
import re
from log import *

class match:
    '''
    Clean up the HTML and build an XPath tree for it.
    '''
    def __init__(self, html, url):
        cleaner = Cleaner(style=True, scripts=True, page_structure=False, safe_attrs_only=False)
        html = cleaner.clean_html(html)
        del cleaner
        self.etree = lxml.html.fromstring(html)
        self.etree.make_links_absolute(base_url=url, resolve_base_href=True)

    '''
    Collect every link that matches the link rules.
    '''
    def get_all_links(self, link_match, url):
        links = []
        all_links = self.etree.xpath('//a')
        for match in link_match:
            regLink = re.compile(url + match.get('value'))
            for a in all_links:
                href = a.get('href')
                if href is None:
                    continue
                if regLink.match(href) is not None:
                    links.append(href)
        del all_links
        return links

    '''
    Check whether a URL is a detail-page link.
    '''
    def check_info_link(self, link_match, url):
        for match in link_match:
            regLink = re.compile(match.get('value'))
            if regLink.match(url) is not None:
                return True
        return False

    def match_info(self, match):
        try:
            return_data = {}
            for param in match:
                # current field definition
                name = param.get('name')
                if name is None:
                    logging.error(u'Could not read the field name')
                    return
                param_tree = lxml.html.fromstring(lxml.etree.tostring(param))
                # apply every parser rule of this field
                for node in param_tree.xpath('//parsers/parser'):
                    xpath = node.get('xpath')
                    type = node.get('type')
                    info_tree = self.etree.xpath(xpath)
                    try:
                        if type is None:
                            logging.error(u'Could not read the data type of field ' + name)
                        # plain-text field
                        elif type == 'text':
                            return_data[name] = ''
                            for val in info_tree:
                                return_data[name] = val.strip()
                        elif type == 'text_array':
                            arr = []
                            for val in info_tree:
                                if val.strip() == '':
                                    continue
                                arr.append(val.strip())
                            return_data[name] = arr
                        elif type == 'html':
                            return_data[name] = ''
                            for val in info_tree:
                                infohtml = lxml.etree.tostring(val, encoding="utf-8", method="html")
                                infohtml = infohtml.strip()
                                reg = re.compile(r'<[!/]?\b(?!(\bpre\b|\bli\b|\bp\b|\bbr\b|\bspan\b|\bimg\b))+\b\s*[^>]*>|[\s\r\n\t]+')
                                infohtml = reg.sub(' ', infohtml).strip()
                                return_data[name] = infohtml
                    except Exception, e:
                        print e.message
            return return_data
        except Exception, e:
            logging.error(u'Failed to extract detail info: ' + e.message)

    def close(self):
        del self.etree
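# Tiny demo (the sample HTML is made up) of the whitelist regex used by
# match_info above: tags other than pre/li/p/br/span/img are stripped and
# whitespace runs collapse to single spaces.
if __name__ == '__main__':
    reg = re.compile(r'<[!/]?\b(?!(\bpre\b|\bli\b|\bp\b|\bbr\b|\bspan\b|\bimg\b))+\b\s*[^>]*>|[\s\r\n\t]+')
    sample = '<div class="x"><p>hello</p>\n<b>bold</b></div>'
    print reg.sub(' ', sample).strip()   # keeps <p>...</p>, drops <div> and <b>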
2013-05-09 07:22:21 index.py[line:21] INFO -----------------------------------------------------------------------------
2013-05-09 07:22:21 index.py[line:22] INFO Creating newsspider object: yousheng.xml
2013-05-09 07:22:21 index.py[line:61] INFO Start executing config file: yousheng.xml
2013-05-09 07:22:22 curl.py[line:23] ERROR Encoding error: http://www.tingchina.com/, --
2013-05-09 07:22:22 index.py[line:101] INFO Done:
<?xml version="1.0" encoding="utf-8"?>
<root>
<site siteName="ffdy" readMode="normal" url="http://www.tingchina.com/" daily="0.3" charset="utf-8">
<linkRules>
<rule type="reg" value="(yousheng)" />
</linkRules>
<targets>
<target name="info">
<urlRules>
<rule type="reg" value=".*/yousheng/disp_(\d+).htm" />
</urlRules>
<model dataType="array">
<field name="title">
<parsers>
<parser type="text" xpath="//h1/text()" />
</parsers>
</field>
<field name="url">
<parsers>
<parser type="pageurl" xpath="//h1/text()" />
</parsers>
</field>
<field name="detail_pic">
<parsers>
<parser type="text" xpath="//div[@class='detail_pic']/span/img/@src" />
</parsers>
</field>
<field name="director">
<parsers>
<parser type="array" xpath="//div[@class='detail_intro']/table/tr/td[text()='导演:']/../td[last()]/a/text()" code="u" />
</parsers>
</field>
<field name="leading">
<parsers>
<parser type="array" xpath="//div[@class='detail_intro']/table/tr/td[text()='主演:']/../td[last()]/a/text()" code="u" />
</parsers>
</field>
<field name="type">
<parsers>
<parser type="array" xpath="//div[@class='detail_intro']/table/tr/td[text()='类型:']/../td[last()]/a/text()" code="u" />
</parsers>
</field>
<field name="area">
<parsers>
<parser type="text" xpath="//div[@class='detail_intro']/table/tr/td[text()='国家/地区:']/../td[last()]/a/text()" code="u" />
</parsers>
</field>
<field name="show_day">
<parsers>
<parser type="text" xpath="//div[@class='detail_intro']/table/tr/td[text()='上映日期:']/../td[last()]/text()" code="u" />
</parsers>
</field>
<field name="comment">
<parsers>
<parser type="html" xpath="//div[@class='filmcontents']" reg="u" />
</parsers>
</field>
</model>
</target>
</targets>
</site>
</root>
/.idea
*backup
*.bak
*tpl.php
/.*
*.zip
*.pyc
{"comment": ["\u4eba\u5230\u4e2d\u5e74\u7684\u7b80\u5357\u4fca\u662f\u4e2a\u4e0d\u592a\u6210\u529f\u7684\u751f\u610f\u4eba\uff0c\u62e5\u6709\u4e00\u4e2a\u5178\u578b\u7684\u4e2d\u4ea7\u9636\u7ea7\u5bb6\u5ead\uff0c\u4ed6\u4e0e\u59bb\u5b50\u548c\u4e24\u4e2a\u5b69\u5b50\uff0c\u4ee5\u53ca\u5cb3\u6bcd\u4f4f\u5728\u53f0\u5317\u4e00\u95f4\u666e\u901a\u516c\u5bd3\u623f\u5b50\u91cc\u3002\u59bb\u5b50\u662f\u4e00\u4e2a\u8106\u5f31\u7684\u5973\u4eba\uff0c\u56e0\u4e3a\u6bcd\u4eb2\u7684\u75c5\u800c\u5fc3\u529b\u4ea4\u7601\u3002\u5c0f\u513f\u5b50\u513f\u5b50\u53ea\u670910\u5c81\u5374\u975e\u5e38\u65e9\u719f\uff0c\u559c\u6b22\u62cd\u6444\u4eba\u7684\u80cc\u5f71\u548c\u63d0\u95ee\u54f2\u5b66\u95ee\u9898\u3002\u5927\u5973\u513f\u662f\u4e00\u4e2a\u97f3\u4e50\u5b66\u751f\uff0c\u56e0\u9677\u5165\u4e86\u9519\u8bef\u7684\u7231\u60c5\u800c\u5f00\u59cb\u5c1d\u5230\u4eba\u751f\u7684\u82e6\u6da9\u3002\u4e00\u6b21\u5728\u5c0f\u8205\u5b50\u7684\u5a5a\u793c\u4e0a\uff0c\u7b80\u5357\u4fca\u9047\u5230\u4e86\u5e74\u8f7b\u65f6\u7684\u5973\u53cb\uff0c\u91cd\u65b0\u71c3\u8d77\u4e86\u4e45\u8fdd\u7684\u7231\u60c5\u2026\u2026", "2000\u5e74\uff0c\u662f\u4e9a\u6d32\u7535\u5f71\u5927\u4e30\u6536\u7684\u4e00\u5e74\uff0c5\u6708\u4e3e\u884c\u7684\u621b\u7eb3\u7535\u5f71\u8282\u51e0\u4e4e\u6210\u4e86\u201c\u4e9a\u6d32\u7535\u5f71\u7684\u8282\u65e5\u201d\uff0c\u5728\u8fd9\u6b21\u7535\u5f71\u8282\u4e0a\uff0c\u300a\u82b1\u6837\u5e74\u534e\u300b\u83b7\u5f97\u4e86\u6700\u4f73\u5f71\u7247\u3001\u6700\u4f73\u7537\u4e3b\u89d2\u4e24\u9879\u5927\u5956\uff0c\u5bfc\u6f14\u738b\u5bb6\u536b\u5927\u51fa\u98ce\u5934\uff0c\u800c\u300a\u4e00\u4e00\u300b\u7684\u5bfc\u6f14\u6768\u5fb7\u660c\u751a\u81f3\u6bd4\u738b\u5bb6\u536b\u66f4\u52a0\u5f15\u4eba\u6ce8\u76ee\uff0c\u56e0\u4e3a\u4ed6\u83b7\u5f97\u4e86\u5c5e\u4e8e\u5bfc\u6f14\u7684\u6700\u9ad8\u8363\u8a89\u2014\u2014\u6700\u4f73\u5bfc\u6f14\u5956\u3002\u4f17\u591a\u7684\u89c2\u4f17\u4e3a\u4ed6\u7684\u8fd9\u90e8\u590d\u6742\u3001\u7ec6\u81f4\u800c\u4f18\u96c5\u7684\u5f71\u7247\u800c\u503e\u5012\uff0c\u5e76\u5bf9\u534e\u8bed\u7535\u5f71\u4ea7\u751f\u4e86\u6781\u5927\u5174\u8da3\u3002\u300a\u4e00\u4e00\u300b\u4e5f\u6210\u529f\u5730\u8fdb\u5165\u4e86\u7f8e\u56fd\u5e02\u573a\uff0c\u6210\u4e3a\u88ab\u7f8e\u56fd\u666e\u901a\u89c2\u4f17\u6240\u770b\u5230\u7684\u7b2c\u4e00\u90e8\u6768\u5fb7\u660c\u5bfc\u6f14\u7684\u5f71\u7247\uff0c\u6210\u4e3a\u4ed6\u7684\u7535\u5f71\u5927\u6b65\u8fc8\u8fdb\u66f4\u5e7f\u9614\u7684\u56fd\u9645\u5e02\u573a\u7684\u7b2c\u4e00\u6b65\u3002"], "title": "\u4e00\u4e00", "url": "http://www.ffdy.cc/movie/10450.html", "leading": ["\u5434\u5ff5\u771f", "\u91d1\u71d5\u73b2", "Issei Ogata", "Kelly Lee (II)", "Jonathan Chang", "Hsi-Sheng Chen", "Su-Yun Ko", "Michael Tao", "\u8427\u6dd1\u614e", "Adrian Lin", "Pang Chang Yu", "Ru-Yun Tang", "Shu-Yuan Hsu", "Hsin-Yi Tseng", "\u9648\u4ee5\u6587", "Tang Congsheng"], "area": "\u4e2d\u56fd\u53f0\u6e7e", "detail_pic": "http://img.kankanba.com/cs/250X350/2/cbe3d833e70d0a44b26ff5cf639fdcc2.jpg", "director": ["\u6768\u5fb7\u660c"], "show_day": "2000-05-14 \u6cd5\u56fd", "type": ["\u5267\u60c5"]}
<?xml version="1.0" encoding="utf-8"?>
<root>
<site siteName="ffdy" url="http://www.ffdy.cc/" daily="0.3" log="ffdy.log" error="ffdy_error.log" charset="utf-8">
<linkRules>
<rule type="reg" value="(type/movie|movie)" />
</linkRules>
<targets>
<target name="info">
<urlRules>
<rule type="reg" value=".*/movie/(\d+).html" />
</urlRules>
<model dataType="array">
<field name="title">
<parsers>
<parser type="text" xpath="//h1/text()" />
</parsers>
</field>
<field name="url">
<parsers>
<parser type="pageurl" xpath="//h1/text()" />
</parsers>
</field>
<field name="detail_pic">
<parsers>
<parser type="text" xpath="//div[@class='detail_pic']/span/img/@src" />
</parsers>
</field>
<field name="director">
<parsers>
<parser type="array" xpath="//div[@class='detail_intro']/table/tr/td[text()='导演:']/../td[last()]/a/text()" code="u" />
</parsers>
</field>
<field name="leading">
<parsers>
<parser type="array" xpath="//div[@class='detail_intro']/table/tr/td[text()='主演:']/../td[last()]/a/text()" code="u" />
</parsers>
</field>
<field name="type">
<parsers>
<parser type="array" xpath="//div[@class='detail_intro']/table/tr/td[text()='类型:']/../td[last()]/a/text()" code="u" />
</parsers>
</field>
<field name="area">
<parsers>
<parser type="text" xpath="//div[@class='detail_intro']/table/tr/td[text()='国家/地区:']/../td[last()]/a/text()" code="u" />
</parsers>
</field>
<field name="show_day">
<parsers>
<parser type="text" xpath="//div[@class='detail_intro']/table/tr/td[text()='上映日期:']/../td[last()]/text()" code="u" />
</parsers>
</field>
<field name="comment">
<parsers>
<parser type="html" xpath="//div[@class='filmcontents']" reg="u" />
</parsers>
</field>
</model>
</target>
</targets>
</site>
</root>
Wed, 10 Apr 2013 20:10:52 log.py[line:21] INFO Site read complete
# -*- coding: utf-8 -*-
__author__ = 'ShengYue'
import wx
from lxml import etree
from index import main

class DemoFrame(wx.Frame):
    def __init__(self):
        self.cateList = []
        wx.Frame.__init__(self, None, -1, u"load goods", size=(400, 200))
        self.draw()

    def draw(self):
        self.panel = wx.Panel(self, -1)
        wx.StaticText(self.panel, -1, u"Enter URL:", (15, 15))
        wx.StaticText(self.panel, -1, u"Select category:", (15, 50))
        sampleList = self.getCate()
        self.getCateList(sampleList)
        self.cate = wx.ComboBox(self.panel, -1, self.cateList[0], (80, 50), wx.DefaultSize, self.cateList)
        self.text = wx.TextCtrl(self.panel, -1, value='', pos=(80, 15), size=(300, 24))
        self.button = wx.Button(self.panel, -1, u"Scrape", pos=(15, 90))
        self.Bind(wx.EVT_BUTTON, self.OnClick, self.button)

    def OnClick(self, event):
        self.button.SetLabel(u'Scraping...')
        self.button.Enable(False)
        index = main()
        done = index.init(self.text.GetValue(), self.cate.GetValue())
        if done:
            self.button.SetLabel(u'Scrape')
            self.button.Enable(True)

    def getCateList(self, cate):
        # flatten the nested category list
        for s in cate:
            if isinstance(s, list):
                self.getCateList(s)
            else:
                self.cateList.append(s)

    def getCate(self, nodes=None, p=''):
        # build "parent->child" labels from cate.xml, recursing into children
        ret = []
        if nodes is None:
            xtree = etree.parse(open('cate.xml'))
            nodes = xtree.xpath('/root/cate')
        for cate in nodes:
            row = cate.getchildren()
            ret.append(p + cate.get('name'))
            if row:
                ret.append(self.getCate(row, p + cate.get('name') + '->'))
        return ret

app = wx.PySimpleApp()
frame = DemoFrame()
frame.Show()
app.MainLoop()
# -*- coding: utf-8 -*-
__author__ = 'ShengYue'
from lxml import etree
from os.path import join, getsize
from model.curl import curl
import csv
import re
import string

# Column headers for the shop-import CSV (kept verbatim in the format the
# target shop system expects).
header = ("*:通用商品类型", "bn:商品货号", "ibn:规格货号", "col:分类", "col:品牌", "col:市场价", "col:成本价", "col:销售价", "col:商品名称",
          "col:上架", "col:规格", "price:普通会员", "price:高级会员", "price:VIP会员", "col:缩略图", "col:图片文件", "col:商品简介",
          "col:详细介绍", "col:重量", "col:单位", "col:库存", "col:货位", "col:大图片", "col:小图片")

class main:
    def init(self, url, cate):
        self.curl = curl()
        html = self.curl.read(url)
        #fop = open('./html.html')
        #print getsize('./html.html')
        #fop.write(html)
        #try:
        #    html = fop.read(getsize('./html.html'))
        #    #html = self.curl.mdcode(html)
        #finally:
        #    fop.close()
        #print html
        data = {}
        xtree = etree.HTML(html)
        # title
        title = xtree.xpath('//h1')
        data['name'] = string.strip(title[0].text)
        # sale price
        price = xtree.xpath('//span[@id="ECS_SHOPPRICE"]')
        data['price'] = string.strip(price[0].text)
        # original price
        oldprice = xtree.xpath('//span[@class="xline"]')
        oldprice = re.findall(re.compile('[\d.]*'), string.strip(oldprice[0].text))
        data['oldprice'] = oldprice[1]
        # brand
        #brand = xtree.xpath('//*[@id="ECS_FORMBUY"]/div/div[3]/span[2]/a')
        #data['brand'] = string.strip(brand[0].text)
        # SKU
        huohao = xtree.xpath('//*[@id="ECS_FORMBUY"]/div/p/span[2]')
        data['ibn'] = string.strip(huohao[0].text)
        # large image
        bimg = xtree.xpath('//*[@id="thumg"]')
        imgurl = string.strip(bimg[0].get('src'))
        data['bimg'] = self.curl.down(imgurl)
        # small image (reuse the large one)
        data['simg'] = data['bimg']
        # detail description
        dest = xtree.xpath('//div[@class="deszone"]/div[@class="zones"]')
        des = etree.tostring(dest[0], encoding='utf-8')
        #data['des'] = des
        reg = re.compile('\s', re.I)
        s = reg.subn(' ', des)
        data['des'] = s[0]
        data['des'] = data['des'].replace('src2', 'src')
        #print data['des']
        # download every image referenced in the description
        #ireg = re.compile("<img\b[^<>]*?\bsrc[2\s\t\r\n]*=[\s\t\r\n]*['\"]?[\s\t\r\n]*(\?<imgUrl>[^\s\t\r\n'\"<>]*)[^<>]*?/?[\s\t\r\n]*>")
        imgreg = re.compile(r"<img\b[^<>]*?\bsrc[2\s\t\r\n]*=[\s\t\r\n]*['\"]?[\s\t\r\n]*([^\s\t\r\n'\"<>]*)[^<>]*?/?[\s\t\r\n]*>")
        ilist = imgreg.findall(data['des'])
        for img in ilist:
            try:
                print u'Downloading ' + img
                new = self.curl.down(img)
                data['des'] = data['des'].replace(img, new)
            except:
                print u'Download failed: ' + img
        #print cate
        #cate = ''
        # assemble the CSV row
        row = (self.curl.mdcode('通用商品类型'), self.curl.mdcode(data['ibn']), '', self.curl.mdcode(cate), '',
               self.curl.mdcode(data['oldprice']), self.curl.mdcode(data['price']), self.curl.mdcode(data['price']),
               self.curl.mdcode(data['name']), 'Y', '', self.curl.mdcode(data['price']), self.curl.mdcode(data['price']),
               self.curl.mdcode(data['price']), self.curl.mdcode(data['simg']), self.curl.mdcode(data['bimg']), '',
               self.curl.mdcode(data['des']), '0.000', '', '', '', self.curl.mdcode(data['bimg']), self.curl.mdcode(data['simg']))
        fop = open('tmp.csv', 'wb')  # binary mode keeps the csv module from writing blank rows on Windows
        writer = csv.writer(fop)
        writer.writerow(header)
        writer.writerow(row)
        print u'Done'
        fop.close()
        return True

#mai = main()
#mai.init()
#curls = curl()
#curls.down('http://www.msex.com/static/upload/1303121657296625.jpg',{})
# -*- coding: utf-8 -*-
__author__ = 'ShengYue'
import urllib2
import time
import random
import os.path
import urllib
from log import log

class curl:
    # URL table
    urlList = {}
    req = None

    # character-encoding handling: try common encodings in turn
    def mdcode(self, data):
        # code = chardet.detect(data)
        # return data.decode(code['encoding'])
        for c in ('utf-8', 'gbk', 'gb2312'):
            try:
                return data.decode(c)
            except:
                pass
        # fall back to the raw bytes if nothing decodes
        return data
        #
        # for c in ('utf-8', 'gbk', 'gb2312'):
        #     try:
        #         return data.encode('utf-8')
        #     except:
        #         pass
        #
        # return data
        #

    def getBaseUrl(self, base_url, link):
        print ''

    def read(self, url, config={}):
        try:
            url = urllib.unquote(url)
            header = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; rv:19.0) Gecko/20100101 Firefox/19.0'}
            self.req = urllib2.Request(url, headers=header)
            # extra request headers
            for key in config:
                self.req.add_header(key, config[key])
            res = urllib2.urlopen(self.req)
            html = res.read()
            res.close()
            # code = chardet.detect(html)
            return self.mdcode(html)
        except:
            print u'Failed to fetch HTML'
            return ''

    def getFileName(self):
        return time.strftime('%y%m%d%H%M', time.localtime(time.time())) + '-' + str(random.randint(10, 99)) + '-' + str(random.randint(10, 99))

    def down(self, url):
        ext = os.path.splitext(url)[-1]
        socket = urllib2.urlopen(url)
        data = socket.read()
        fileName = self.getFileName() + ext
        with open('./images/' + fileName, "wb") as jpg:
            jpg.write(data)
        socket.close()
        return '/uploads/images/' + fileName
# -*- coding: utf-8 -*-
__author__ = 'ShengYue'
import MySQLdb
import hashlib

db_host = '127.0.0.1'
db_name = 'root'
db_passwd = 'LEsc2008'
db_dbname = 'python'
db_port = 3306

class db:
    #self.conn = None
    def __init__(self):
        try:
            self.conn = MySQLdb.connect(host=db_host, user=db_name, passwd=db_passwd, port=db_port, use_unicode=True, charset='utf8')
            self.cur = self.conn.cursor()
            #print self.cur
            '''Create the database if it does not exist'''
            #count = self.cur.execute("create database if not exists %s", db_dbname)
            #print count
            self.conn.select_db(db_dbname)
            #self.cur.execute("SET NAMES utf8")
        except MySQLdb.Error, e:
            print "Mysql Error %d: %s" % (e.args[0], e.args[1])

    '''
    Fetch an uncrawled link for the given site.
    '''
    def get_url(self, web_name):
        self.cur.execute("SELECT * FROM links WHERE web_name = %s AND status=0", web_name)
        return self.cur.fetchone()

    '''
    Persist a link.
    '''
    def add_url(self, link, web_name):
        md5 = hashlib.md5(link).hexdigest()
        print link
        self.cur.execute("INSERT INTO links(`link`, `web_name`, `md5`) VALUES(%s, %s, %s)", [link, web_name, md5])
        self.conn.commit()

    '''
    Check whether a link already exists.
    '''
    def check_url(self, link):
        md5 = hashlib.md5(link).hexdigest()
        return self.cur.execute("SELECT * FROM links WHERE `md5`=%s", md5)
        #return self.cur.fetchone()

    def update_url(self, id):
        self.cur.execute("UPDATE links SET status = 1 WHERE id=%s", id)
        self.conn.commit()
        return True

    def add_star(self, director):
        #print director
        count = self.cur.execute("SELECT id FROM star WHERE name=%s", director)
        if count == 0:
            self.cur.execute("INSERT INTO star(name) VALUES(%s)", director)
            star_id = self.conn.insert_id()
            self.conn.commit()
            return str(star_id)
        else:
            star = self.cur.fetchone()
            return str(star[0])

    def addData(self, data):
        #print data
        ### directors
        director = ''
        try:
            for daoyan in data['director']:
                director += ',' + self.add_star(daoyan)
            director = director.strip(',')
        except:
            director = ''
        ### leading actors
        leading = ''
        try:
            for lead in data['leading']:
                leading += ',' + self.add_star(lead)
            leading = leading.strip(',')
        except:
            leading = ''
        ### synopsis
        comment = ''
        try:
            for comm in data['comment']:
                comment += comm
        except:
            comment = ''
        # title, image, link
        insert_data = [data['title'], data['detail_pic'], data['url'], director, leading, data['area'], data['show_day'], comment]
        self.add_movie(insert_data)

    def add_movie(self, insertData):
        self.cur.execute("INSERT INTO movie(`title`,`img`,`url`,`director`,`leading`,`area`,`show_day`,`comment`) VALUES(%s, %s, %s, %s, %s, %s, %s, %s)", insertData)
        self.conn.commit()

    '''
    Close the database.
    '''
    def close(self):
        try:
            self.cur.close()
            self.conn.close()
        except:
            pass
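# Usage sketch for the MySQL-backed db class above. It assumes the links,
# star and movie tables already exist in the `python` database; their
# CREATE statements are not part of this gist. The URL is illustrative.
if __name__ == '__main__':
    d = db()
    if d.check_url('http://www.ffdy.cc/movie/10450.html') == 0:
        d.add_url('http://www.ffdy.cc/movie/10450.html', 'ffdy')
    print d.get_url('ffdy')
    d.close()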
# -*- coding: utf-8 -*-
__author__ = 'ShengYue'
import logging

class log:
    @staticmethod
    def read(file):
        try:
            fopen = open(file, 'r')
            data = fopen.read()
            fopen.close()
            return data
        except:
            pass

    @staticmethod
    def write(file, logs):
        try:
            # note: basicConfig only takes effect on the first call, and
            # filemode='w' truncates the log file on that call
            logging.basicConfig(level=logging.DEBUG, format='%(asctime)s %(filename)s[line:%(lineno)d] %(levelname)s %(message)s', datefmt='%a, %d %b %Y %H:%M:%S', filename=file, filemode='w')
            logging.info(logs)
        except Exception, e:
            print '-----------', Exception, e
# -*- coding: utf-8 -*-
__author__ = 'ShengYue'
import lxml
import lxml.etree
from lxml.html.clean import Cleaner
import re

class match:
    '''
    Clean up the HTML and build an XPath tree for it.
    '''
    def __init__(self, html, url):
        cleaner = Cleaner(style=True, scripts=True, page_structure=False, safe_attrs_only=False)
        html = cleaner.clean_html(html)
        del cleaner
        self.etree = lxml.html.fromstring(html)
        self.etree.make_links_absolute(base_url=url, resolve_base_href=True)

    '''
    Collect every link that matches the link rules.
    '''
    def get_all_links(self, link_match, url):
        links = []
        all_links = self.etree.xpath('//a')
        for match in link_match:
            regLink = re.compile(url + match.get('value'))
            for a in all_links:
                href = a.get('href')
                if href is None:
                    continue
                if regLink.match(href) is not None:
                    links.append(href)
                #else:
                #    print 'no match', a.get('href')
        del all_links
        return links

    '''
    Extract every configured field from the page.
    '''
    def get_match_info(self, match, url=None):
        try:
            data = {}
            for param in match:
                name = param.get('name')
                ntree = lxml.html.fromstring(lxml.etree.tostring(param))
                node = ntree.xpath('//parsers/parser')[0]
                xpath = node.get('xpath')
                infoxpath = self.etree.xpath(xpath)
                try:
                    nodetype = node.get('type')
                    if nodetype == 'text':
                        data[name] = infoxpath[0].strip()
                    elif nodetype == 'array':
                        arr = []
                        for item in infoxpath:
                            if item.strip() == '':
                                continue
                            arr.append(item.strip())
                        data[name] = arr
                    elif nodetype == 'pageurl':
                        data[name] = url
                    elif nodetype == 'html':
                        infohtml = lxml.etree.tostring(infoxpath[0], encoding="utf-8", method="html")
                        infohtml = infohtml.strip()
                        reg = re.compile(r'<[!/]?\b(?!(\bpre\b|\bli\b|\bp\b|\bbr\b|\bspan\b|\bimg\b))+\b\s*[^>]*>|[\s\r\n\t]+')
                        infohtml = reg.sub(' ', infohtml).strip()
                        data[name] = infohtml
                except:
                    data[name] = ''
                    print name, u'could not be read'
                    continue
        except:
            print xpath, u'could not be read'
        return data

    def match_tiantang(self, match, url):
        try:
            data = {}
            for param in match:
                name = param.get('name')
                ntree = lxml.html.fromstring(lxml.etree.tostring(param))
                node = ntree.xpath('//parsers/parser')[0]
                xpath = node.get('xpath')
                infoxpath = self.etree.xpath(xpath)
                try:
                    nodetype = node.get('type')
                    if nodetype == 'text':
                        data[name] = infoxpath[0].strip()
                    elif nodetype == 'array':
                        arr = []
                        for item in infoxpath:
                            if item.strip() == '':
                                continue
                            arr.append(item.strip())
                        data[name] = arr
                    elif nodetype == 'pageurl':
                        data[name] = url
                    elif nodetype == 'html':
                        infohtml = lxml.etree.tostring(infoxpath[0], encoding="utf-8", method="html")
                        infohtml = infohtml.strip()
                        reg = re.compile(r'<[!/]?\b(?!(\bpre\b|\bli\b|\bp\b|\bbr\b))+\b\s*[^>]*>|[\s\r\n\t]+')
                        infohtml = reg.sub(' ', infohtml).strip()
                        data[name] = infohtml
                except:
                    data[name] = ''
                    print name, u'could not be read'
                    continue
            return data
        except:
            #log.write('system.log', xpath + u' could not be read')
            print xpath, u'could not be read'

    '''
    Extract every configured field, tolerating empty XPath results.
    '''
    def get_match_info_test(self, match, url=None):
        try:
            data = {}
            for param in match:
                name = param.get('name')
                ntree = lxml.html.fromstring(lxml.etree.tostring(param))
                node = ntree.xpath('//parsers/parser')[0]
                xpath = node.get('xpath')
                infoxpath = self.etree.xpath(xpath)
                try:
                    nodetype = node.get('type')
                    if nodetype == 'text':
                        if infoxpath != []:
                            data[name] = infoxpath[0].strip()
                        else:
                            data[name] = ''
                    elif nodetype == 'array':
                        arr = []
                        if infoxpath == []:
                            data[name] = arr
                        else:
                            for item in infoxpath:
                                if item.strip() == '':
                                    continue
                                arr.append(item.strip())
                            data[name] = arr
                    elif nodetype == 'pageurl':
                        data[name] = url
                    elif nodetype == 'html':
                        if infoxpath == []:
                            data[name] = ''
                        else:
                            infohtml = lxml.etree.tostring(infoxpath[0], encoding="utf-8", method="html")
                            infohtml = infohtml.strip()
                            reg = re.compile(r'<[!/]?\b(?!(\bpre\b|\bli\b|\bp\b|\bbr\b))+\b\s*[^>]*>|[\s\r\n\t]+')
                            infohtml = reg.sub(' ', infohtml).strip()
                            data[name] = infohtml
                except:
                    data[name] = ''
                    print name, u'could not be read'
                    continue
        except:
            #log.write('system.log', xpath + u' could not be read')
            print xpath, u'could not be read'
        return data

    # print self.etree.xpath('//h1/text()')[0]
    # print self.etree.xpath('//h1/em/text()')[0]
    # com = self.etree.xpath("//div[@class='filmcontents']/node()/text()|//div[@class='filmcontents']/text()")
    # s = ''
    # for c in com:
    #     s = s + c
    # print s
    # # rule exploration
    # d = self.etree.xpath(u"//div[@class='detail_intro']/table/tr/td[text()='上映日期:']/../td[last()]/text()")
    # print d[0]

    def close(self):
        del self.etree
Mon, 08 Apr 2013 22:17:20 log.py[line:25] INFO Site read complete
# mysetup.py
from distutils.core import setup
import py2exe
setup(options={"py2exe": {"dll_excludes": ["MSVCP90.dll", 'lxml.dll']}}, windows=[{"script": "frame.py"}])
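# Build note (assumed invocation): run `python mysetup.py py2exe` from the
# project directory; py2exe then writes the frozen frame.exe into ./dist.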
# -*- coding: utf-8 -*-
__author__ = 'ShengYue'
from lxml import etree
from model.db import db
from model.curl import curl
from model.match import match
from model.log import log
import re
import time

i = 1

'''
The spider itself.
'''
class spiderling:
    def __init__(self, config):
        self.i = 0
        try:
            configtree = etree.ElementTree(file=config)
            # site attributes
            sites = configtree.xpath('//site')
            site = sites[0]
            self.url = site.get('url')
            self.site_name = site.get('siteName')
            self.daily = float(site.get('daily'))
            self.log = site.get('log')
            self.errlog = site.get('error')
            self.linkRule = configtree.xpath('//linkRules/rule')
            self.infoUrlRule = configtree.xpath('//urlRules/rule')
            self.infoRule = configtree.xpath('//targets/target/model/field')
        except:
            log.write('error.log', u'Failed to read the config file')
        self.db = db()

    def run(self, url):
        # throttle between requests
        time.sleep(self.daily)
        if url is None:
            info = self.db.get_url(self.site_name)
            if info is None:
                log.write(self.log, u'Site read complete')
                return 0
            self.db.update_url(info[0])
            url = info[1]
        gurl = curl()
        html = gurl.read(url)
        try:
            if html.strip() == '':
                return self.run(None)
        except Exception, e:
            log.write(self.log, url + u' failed to fetch html: ' + unicode(e))
            return self.run(None)
        #print html
        self.xtree = match(html, url)
        links = self.xtree.get_all_links(self.linkRule, self.url)
        '''Persist the harvested links'''
        for link in links:
            if self.db.check_url(link) == 0:
                self.db.add_url(link, self.site_name)
        '''If the current link is a detail page, extract the fields'''
        #for infoxpath in self.infoRule:
        #    self.xtree.get_match_info(self.infoRule)
        regInfoLink = re.compile(self.infoUrlRule[0].get('value'))
        if regInfoLink.match(url) is not None:
            self.i = self.i + 1
            data = self.xtree.get_match_info(self.infoRule, url)
            self.db.addData(data)
            #
            # file_object = open(str(self.i) + 'id.txt', 'w')
            # file_object.write(json.dumps(data))
            # file_object.close()
            #
            #print json.dumps(data)
        else:
            print u'Not a detail page, nothing to parse'
        self.run(None)

    def close(self):
        try:
            self.xtree.close()
        except:
            pass
        try:
            self.db.close()
        except:
            pass

sp = spiderling('cate.xml')
#return
sp.run(sp.url)
#sp.run('http://www.ffdy.cc/movie/35622.html')
sp.close()

#import sqlite3  # import the module
#cx = sqlite3.connect("d:\\test.db")
#
#cu = cx.cursor()
##cu.execute("""create table catalog ( id integer primary key, pid integer, name varchar(10) UNIQUE )""")
##
##cu.execute(u"insert into catalog values(2, 0, '哈哈')")
##cu.execute(u"insert into catalog values(3, 0, '我是中国')")
##cx.commit()
#
#cu.execute("select * from catalog")
#d = cu.fetchall()
#for s in d:
#    print s[2]
#cu.close()
#cx.close()
# -*- coding: utf-8 -*-
__author__ = 'ShengYue'
from lxml import etree
from model.db import db
from model.curl import curl
from model.match import match
import re
import time
import lxml

i = 1

'''
The spider (scratch/test version).
'''
class spiderling:
    def __init__(self, config):
        self.i = 0
        configtree = etree.ElementTree(file=config)
        site = configtree.xpath('//site')
        self.url = site[0].get('url')
        self.site_name = site[0].get('siteName')
        self.linkRule = configtree.xpath('//linkRules/rule')
        self.infoUrlRule = configtree.xpath('//urlRules/rule')
        self.infoRule = configtree.xpath('//targets/target/model/field')
        #print self.linkRule[0].get('value')
        self.db = db()

    def run(self, url):
        time.sleep(0.3)
        if url is None:
            info = self.db.get_url(self.site_name)
            if info is None:
                print u'Crawl finished'
                return 0
            self.db.update_url(info[0])
            url = info[1]
        gurl = curl()
        html = gurl.read(url)
        try:
            if html.strip() == '':
                return self.run(None)
        except:
            return self.run(None)
        #print html
        self.xtree = match(html, url)
        d = self.xtree.etree.xpath("//div[@class='filmcontents']")
        sd = etree.tostring(d[0], encoding="utf-8", method="html")
        sd = sd.strip()
        print sd
        print '================================='
        reg = re.compile(r'<[!/]?\b(?!(\bpre\b|\bli\b|\bp\b|\bbr\b))+\b\s*[^>]*>|[\s\r\n\t]+')
        ds = reg.sub(' ', sd).strip()
        print ds
        return  # scratch test stops here; the crawl below is disabled
        links = self.xtree.get_all_links(self.linkRule, self.url)
        '''Persist the harvested links'''
        for link in links:
            if self.db.check_url(link) == 0:
                self.db.add_url(link, self.site_name)
        '''If the current link is a detail page, extract the fields'''
        #for infoxpath in self.infoRule:
        #    self.xtree.get_match_info(self.infoRule)
        regInfoLink = re.compile(self.infoUrlRule[0].get('value'))
        if regInfoLink.match(url) is not None:
            self.i = self.i + 1
            print u'Detail page, parsing it', str(self.i)
            data = self.xtree.get_match_info_test(self.infoRule, url)
            print u'Inserting data for', url
            self.db.addData(data)
            #
            # file_object = open(str(self.i) + 'id.txt', 'w')
            # file_object.write(json.dumps(data))
            # file_object.close()
            #
            #print json.dumps(data)
        else:
            print u'Not a detail page, nothing to parse'
        self.run(None)

    def close(self):
        self.xtree.close()
        self.db.close()

#sp = spiderling('cate.xml')
#sp.run(sp.url)
#sp.run('http://www.ffdy.cc/movie/35622.html')
#sp.close()

url = 'http://www.dytt8.net/html/gndy/dyzz/20130407/41866.html'
curls = curl()
html = curls.read(url, {})
xtree = match(html, url)
content = xtree.etree.xpath('//div[@id="Zoom"]')
infohtml = lxml.etree.tostring(content[0], encoding="utf-8", method="html")
infohtml = infohtml.strip()
reg = re.compile(r'<[!/]?\b(?!(\bpre\b|\bli\b|\bp\b|\bbr\b|\bspan\b|\bimg\b|\ba\b))+\b\s*[^>]*>')
infohtml = reg.sub(' ', infohtml).strip()
pattern = re.compile(r'◎年  代 ([^<]*)')
ds = pattern.search(html)
print
if ds is None:
    print u'Not found'
else:
    print ds.group(1)
print infohtml