Skip to content

Instantly share code, notes, and snippets.

@medcl
Last active August 29, 2015 14:02
Show Gist options
  • Save medcl/5ed9c542a1ac8e001988 to your computer and use it in GitHub Desktop.
Save medcl/5ed9c542a1ac8e001988 to your computer and use it in GitHub Desktop.
# String values should not be wrapped in double quotes ("").
# Backslashes (\) in regex values do not need to be escaped.
# General settings.
# NOTE(review): some parsers treat [DEFAULT] as fallback values for every other section; confirm how the consumer reads it.
[DEFAULT]
# Separator for list-valued settings (presumably the one used by SplitByUrlParameter below; confirm).
ArrayStringSplitter: ,
# NOTE(review): presumably toggles Go profiling; confirm against the consumer.
GoProfEnabled: false
# Comma-separated URL query parameter names; pid is also the parameter used by UrlTemplate in [RuledFetch].
SplitByUrlParameter: p,pn,pid
# Cluster identity.
[cluster]
# Name of the cluster.
name: Gopa
# Fetch settings.
[fetch]
# NOTE(review): looks like a range of shard numbers (1 through 10); the exact meaning is not visible here, so confirm.
shard: 1-10
# Process-wide settings.
[Global]
# NOTE(review): presumably the maximum number of concurrent goroutines; confirm.
MaxGoRoutine: 2
# NOTE(review): presumably controls whether fetched pages are stored together; confirm the storage layout.
StoreWebPageTogether: true
# Feature switches; every key in this section is a true/false flag.
# NOTE(review): the effect of each flag is defined by the consuming program and is not visible here; confirm before changing.
[Switch]
ParseUrlsFromSavedFileLog: true
LoadTemplatedFetchJob: false
LoadPendingFetchJobs: true
# NOTE(review): presumably enables the jobs configured in [RuledFetch]; confirm.
LoadRuledFetchJob: true
ParseUrlsFromPreviousSavedPage: false
HttpEnabled: true
# Bloom filter settings.
[BloomFilter]
# NOTE(review): presumably the expected number of items the filter is sized for; confirm.
ItemSize: 100000
# Rules for the crawl task: which pages are skipped and how links are extracted from fetched pages.
[CrawlerRule]
# Alternative task for cargo listings (disabled):
#Name: GopaTask1
# Active task: truck listings.
Name: GopaTask2
#------ExtractLinksFromPage-----------
# Regex of URLs whose pages are not parsed, selected by file extension
# (scripts, stylesheets, archives, executables, images).
# Backslashes are written singly, as the note at the top of this file says;
# when doubled, the pattern requires a literal backslash and never matches a URL such as /a.js.
SkipPageParsePattern: .*?\.((js)|(css)|(rar)|(gz)|(zip)|(exe)|(bmp)|(jpeg)|(gif)|(png)|(jpg)|(apk))\b
# Link pattern for truck listings. The dot before html is escaped so that it matches only a literal dot.
LinkUrlExtractRegex: <a href="(/\d+/\d+\.html)" onclick="ClickCheline
# Link pattern for cargo listings (disabled):
#LinkUrlExtractRegex: <a target="_blank" rel="nofollow" href="(/\d+/\d+\.html)" >
# Index of the capture group in LinkUrlExtractRegex that holds the link URL.
LinkUrlExtractRegexGroupIndex: 1
# Value is in milliseconds.
FetchDelayThreshold: 800
# Fetch jobs generated from a URL template.
[RuledFetch]
# Template for cargo listings (disabled):
#UrlTemplate: http://www.chinawutong.com/103.html?pid={id}&f=&t=&lx=&hk=&hl=
# Template for truck listings. The comment sits on its own line so that it
# cannot be read as part of the URL (or as a URL fragment) by the parser.
UrlTemplate: http://www.chinawutong.com/102.html?pid={id}
# NOTE(review): From/To presumably bound the values substituted for {id};
# whether To is inclusive is not visible here, so confirm.
From: 0
To: 550
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment