Skip to content

Instantly share code, notes, and snippets.

@nobodyzxc
Last active July 30, 2016 06:12
Show Gist options
  • Save nobodyzxc/cf947fd4b7d0f6362a29c9271f19a2bc to your computer and use it in GitHub Desktop.
Save nobodyzxc/cf947fd4b7d0f6362a29c9271f19a2bc to your computer and use it in GitHub Desktop.
python scrapy files
#include<stdio.h>
#include<stdlib.h>
#include<string.h>
/* Line buffer shared by main() (which fills it) and back() (which uses its
 * start address as the lower bound for the backward scan). */
char str[1000];

/*
 * Print the text that immediately precedes `tail`, followed by one space.
 *
 * Starting at `tail`, walk left until a ';' or '>' is found and print
 * everything strictly between that delimiter and `tail`.  The caller passes
 * a pointer to a "</td>" inside `str`, so the printed text is the cell
 * content after the opening tag's '>' (the ';' case presumably covers a
 * trailing HTML entity such as "&nbsp;" -- TODO confirm against real input).
 *
 * The walk never goes below the start of `str`: if no delimiter exists,
 * everything from the beginning of the buffer up to `tail` is printed
 * instead of reading memory in front of the buffer.
 *
 * Always returns 0.
 */
int back(char *tail){
    const char *head = tail;

    /* Scan left for the delimiter, but stop at the first byte of str. */
    while(head > str && *head != ';' && *head != '>')
        head--;

    /* Skip the delimiter itself; if we stopped at the buffer start without
     * finding one, that first byte is part of the text and is kept. */
    if(*head == ';' || *head == '>')
        head++;

    for( ; head < tail ; head++)
        printf("%c" , *head);
    printf(" ");
    return 0;
}
/*
 * Usage: fetch <columns>
 *
 * Reads lines from stdin into the global buffer `str`.  For each line, up to
 * <columns> occurrences of "</td>" are located from left to right; the text
 * in front of each one is printed via back(), and the tag is then overwritten
 * so the next strstr() finds the following cell.  One output line is emitted
 * per input line.
 *
 * Returns -1 when the argument count is wrong, 0 otherwise.
 */
int main(int argc , char *argv[]){
    if(argc != 2){
        puts("Wrong argc");
        return -1;
    }
    int i , cnt = atoi(argv[1]);//cnt is column num
    char *tail;
    /* Number of characters in "</td>" (without the terminating NUL). */
    const size_t tag_len = sizeof("</td>") - 1;

    while(fgets(str , sizeof(str) , stdin)){
        for(i = 0 ; i < cnt ; i++){
            tail = strstr(str , "</td>");
            if(tail == NULL)
                break;
            back(tail);
            /* Blank out exactly the tag that was just consumed.  Copying 6
             * bytes here would also overwrite the character after the tag,
             * which is the NUL terminator when the tag ends the buffer. */
            memcpy(tail , "ignore" , tag_len);
        }
        puts("");
    }
    return 0;
}
import scrapy
class RateItem(scrapy.Item):
    """Container for one scrape of the exchange-rate table.

    Both fields hold whatever the spider assigns to them; in this file the
    spider stores lists of extracted cell texts.
    """

    # Currency labels (the spider fills this from the ``td.titleLeft`` cells).
    currency = scrapy.Field()
    # Rate values (the spider fills this from the ``td.decimal`` cells).
    value = scrapy.Field()
class RatePipeline(object):
    """Item pipeline that writes the scraped rate table to ``rate.txt``."""

    def process_item(self, item, spider):
        """Write ``item`` as a text table to ``rate.txt`` and return it.

        The file is rewritten on every call.  It starts with a header line,
        followed by one line per entry of ``item['currency']``; each line
        carries the next four entries of ``item['value']``.  If fewer values
        than expected are present, the affected rows are simply shorter.

        Args:
            item: Mapping with ``'currency'`` and ``'value'`` sequences of
                text.
            spider: The spider that produced the item (unused).

        Returns:
            The unchanged ``item``.
        """
        # Local import keeps this block self-contained; io.open accepts an
        # explicit encoding on both Python 2 and Python 3.
        import io

        currencies = item['currency']
        values = item['value']
        # Write text through a UTF-8 encoded stream instead of formatting
        # str(bytes), which yields the "b'...'" repr on Python 3.
        with io.open('rate.txt', 'w', encoding='utf-8') as out:
            out.write(u'%20s %10s %10s %10s %10s' % (
                'currency', 'cash_in', 'cash_out', 'sight_in', 'sight_out'))
            out.write(u'\n')
            for row, country in enumerate(currencies):
                out.write(u'{0:->25}'.format(country))
                # Four rate cells belong to each currency row.
                for value in values[row * 4:row * 4 + 4]:
                    out.write(u'%10s ' % value)
                out.write(u'\n')
        return item
#!/bin/bash
# Download a rate page, compile the C extractor if needed, and print the
# table cells found on lines containing "decimal".
#
# Usage: scrapy [ URL ]
#   With no argument the default URL below is fetched.
#   h | help | -h | --help prints the usage line.
#
# Exits with status 1 when the download or the compilation fails.
URL="http://rate.bot.com.tw/Pages/Static/UIP003.zh-TW.htm"
COLS=10 #how many columns do you want to fetch...
C_FILE="fetch.c"

if [ $# -eq 1 ]; then
    case "$1" in
        "h" | "help" | "-h" | "--help")
            echo "scrapy [ URL ]"
            exit;;
        *)
            URL="$1";;
    esac
fi

# Quote the URL so characters such as '?' or '*' are not glob-expanded.
if ! wget -O bank_rate.html "$URL" 2>/dev/null; then
    echo "download file failed" >&2
    # wget -O creates the output file even when the download fails.
    rm -f bank_rate.html
    exit 1
fi

if [ ! -f fetch.exe ]; then
    if ! gcc -o fetch.exe "$C_FILE"; then
        echo "compile fetch.c failed" >&2
        rm -f bank_rate.html
        exit 1
    fi
fi

grep decimal bank_rate.html | ./fetch.exe "$COLS" > rate.txt
cat rate.txt
# Everything produced by this run is temporary.
rm -f bank_rate.html fetch.exe rate.txt
import scrapy
import sys
from rate.items import RateItem
class RateSpider(scrapy.Spider):
    """Spider that pulls currency labels and rate values from one page."""

    name = "rate_spider"
    allowed_domains = ["bot.com.tw"]
    start_urls = ['http://rate.bot.com.tw/Pages/Static/UIP003.zh-TW.htm']

    def parse(self, response):
        """Build a single ``RateItem`` from the element with id ``slice1``.

        Args:
            response: The downloaded page for one of ``start_urls``.

        Returns:
            A ``RateItem`` whose ``currency`` field holds the text of every
            ``td.titleLeft`` cell and whose ``value`` field holds the text of
            every ``td.decimal`` cell inside that element.
        """
        rate_table = response.xpath('//*[@id="slice1"]')
        return RateItem(
            currency=rate_table.css('td.titleLeft::text').extract(),
            value=rate_table.css('td.decimal::text').extract(),
        )
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment