Bo_hemian wangkangdegithub
@wangkangdegithub
wangkangdegithub / 行情.py
Created March 15, 2017 03:35
One-hot encode a column of a data table and concatenate two tables into one
# coding: utf-8
import numpy as np
import pandas as pd
from pandas import Series, DataFrame
import matplotlib.pyplot as plt
import matplotlib.font_manager as fm
import csv
myfont = fm.FontProperties(fname='C:/Windows/Fonts/msyh.ttc')
data1 = pd.read_csv("C:/Users/Bohemian/Desktop/salerecord(2).csv")
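The preview cuts off before the one-hot and concatenation steps named in the description; a minimal sketch of how it might continue, assuming a hypothetical second file and a hypothetical categorical column 'category':

# Hypothetical continuation: the second file path and the column name 'category' are assumptions
data2 = pd.read_csv("C:/Users/Bohemian/Desktop/salerecord(3).csv")
# One-hot encode one categorical column
dummies = pd.get_dummies(data1['category'], prefix='category')
data1 = pd.concat([data1.drop(columns='category'), dummies], axis=1)
# Append the two tables row-wise into a single table
combined = pd.concat([data1, data2], ignore_index=True)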
@wangkangdegithub
wangkangdegithub / 相关性.py
Created March 15, 2017 03:33
Correlation testing in Python: t-values, r-values, p-values
# coding: utf-8
import pandas as pd
from scipy import stats
from scipy.stats import ttest_ind
data_train = pd.read_csv('C:/Users/Bohemian/Desktop/dayuecheng.csv')
print(data_train.head(10))
x = data_train['keliuliang']
y = data_train['jindiankeliu']
z = data_train['jdxfcs']
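The preview ends before any test is run; a minimal sketch of how the r/p and t/p values from the description might be computed with the imported scipy functions:

# Pearson correlation between two columns: r-value and its p-value
r, p_r = stats.pearsonr(x, y)
print('r =', r, 'p =', p_r)
# Independent two-sample t-test: t-value and its p-value
t, p_t = ttest_ind(x, z)
print('t =', t, 'p =', p_t)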
@wangkangdegithub
wangkangdegithub / python charts.py
Created March 15, 2017 03:31
Import data from a MySQL database; the query result prints as a pandas Series, but analysis in Python works on DataFrames, so the Series has to be converted to a DataFrame.
# This script: 1. imports data from a MySQL database; the result prints as a Series, but analysis uses DataFrames, so convert the Series to a DataFrame.
import pymysql
import numpy as np
import pandas as pd
from pandas import Series, DataFrame
import matplotlib.pyplot as plt
import matplotlib.font_manager as fm
#import charts
db = pymysql.connect("localhost", user="root", passwd="root", db="zhkj", charset="utf8")
cursor = db.cursor(cursor=pymysql.cursors.DictCursor)
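The preview stops before the query itself; a minimal sketch of the Series-to-DataFrame step the comment describes, assuming a hypothetical table name 'sales':

# Hypothetical query; the table name 'sales' is an assumption
cursor.execute("SELECT * FROM sales")
rows = cursor.fetchall()            # DictCursor -> list of dicts
df = pd.DataFrame(rows)             # list of dicts -> DataFrame
# A single column of df is a Series; wrap it back into a DataFrame when needed
col = df[df.columns[0]].to_frame()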
import numpy as np
import pandas as pd
from pandas import Series, DataFrame
import matplotlib.pyplot as plt
import matplotlib.font_manager as fm
myfont = fm.FontProperties(fname='C:/Windows/Fonts/msyh.ttc')
data_train = pd.read_csv("C:/Users/Bohemian/Desktop/titanic/train.csv")
#print (data_train)
#print (data_train.describe())
@wangkangdegithub
wangkangdegithub / 58同城 电话号码获取.py
Created November 19, 2016 13:31
58.com (58同城): scraping phone-number listings
from bs4 import BeautifulSoup
import requests, urllib.request, pymongo
client = pymongo.MongoClient('localhost', 27017)
TC = client['58TC']
sheet_58 = TC['sheet_58']
def get_more_page(start, end):
    urls = ['http://bj.58.com/shoujihao/pn{}/'.format(str(i)) for i in range(start, end)]
    for url in urls:
        wb_data = requests.get(url)
        soup = BeautifulSoup(wb_data.text, 'lxml')
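The preview ends inside the loop; a minimal sketch of how the extraction and Mongo insert might continue (the CSS selector is an assumption, not taken from the gist):

        # Hypothetical selector for the elements holding the numbers
        for item in soup.select('strong.number'):
            sheet_58.insert_one({'number': item.get_text(), 'url': url})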
@wangkangdegithub
wangkangdegithub / 58同城.py
Created November 19, 2016 13:31
58.com (58同城): product detail pages
from bs4 import BeautifulSoup
import requests, time
page_links = []
header = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.71 Safari/537.36','Cookie':'f=n; ipcity=qd%7C%u9752%u5C9B; f=n; id58=c5/nn1c9J+syWfPhEuVGAg==; mcity=zz; als=0; hots=%5B%7B%22d%22%3A0%2C%22s1%22%3A%22ipad%20%E7%88%B1%E4%BA%BA%E3%80%81%22%2C%22s2%22%3A%22%22%2C%22n%22%3A%22sou%22%7D%5D; __utma=253535702.1008435073.1478163262.1478163262.1478163262.1; __utmz=253535702.1478163262.1.1.utmcsr=zz.58.com|utmccn=(referral)|utmcmd=referral|utmcct=/; myfeet_tooltip=end; bj58_id58s="UExrSDM4aVFmc089NTk4Mw=="; city=zz; 58home=zz; sessionid=4eb22971-5459-4520-b22c-46b75796fe78; bj58_new_session=0; bj58_init_refer=""; bj58_new_uv=4; 58tj_uuid=09dbb43f-9f6c-4711-987a-b2582099fb07; new_session=0; new_uv=5; utm_source=; spm=; init_refer='}
# Fetch all detail-page links from listing page `start` to page `end`
def get_more_page(start, end):
    urls = ['http://bj.58.com/pbdn/0/pn{}'.format(str(i)) for i in range(start, end)]
    for url in urls:
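A minimal sketch of how the loop body might continue, reusing the header defined above; the selector for listing links is an assumption:

        wb_data = requests.get(url, headers=header)
        soup = BeautifulSoup(wb_data.text, 'lxml')
        # Hypothetical selector; the real one is not shown in the preview
        for a in soup.select('td.t > a.t'):
            page_links.append(a.get('href'))
        time.sleep(1)   # pause between listing pages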
@wangkangdegithub
wangkangdegithub / jiandan.py
Created November 19, 2016 13:28
Jandan (煎蛋): batch image downloader
from bs4 import BeautifulSoup
import requests, urllib.request, time
## Problem: even with anti-anti-crawler measures, results are still poor. The `num` in "+item[-num:]" is fragile: set too large, the crawler aborts when letters appear in the name. Needs a fix: try/except?
url = 'http://jandan.net/ooxx/page-'
header = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.71 Safari/537.36',
'Cookie':'gif-click-load=off; bad-click-load=off; _gat=1; jdna=596e6fb28c1bb47f949e65e1ae03f7f5#1478358545174; Hm_lvt_fd93b7fb546adcfbcf80c4fc2b54da2c=1478338211,1478358498; Hm_lpvt_fd93b7fb546adcfbcf80c4fc2b54da2c=1478358546; _ga=GA1.2.666641045.1478338213'}
def get_attractions(url):
    wb_data = requests.get(url, headers=header)
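A minimal sketch of how the download step might continue; the image selector, the protocol prefix, and the num=10 slice are assumptions echoing the comment above:

    soup = BeautifulSoup(wb_data.text, 'lxml')
    for img in soup.select('img'):
        src = img.get('src')
        if src and src.endswith('.jpg'):
            try:
                # Name the file from the URL tail; num=10 is the fragile slice the comment warns about
                urllib.request.urlretrieve('http:' + src, src[-10:])
            except Exception:
                continue    # the try/except the comment suggests
    time.sleep(2)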
@wangkangdegithub
wangkangdegithub / ganji_channel_extract.py
Last active November 19, 2016 10:06
Ganji (赶集网): multithreaded scraping of detail pages
# Get the product-category links from the Ganji homepage:
from bs4 import BeautifulSoup
import requests, pymongo
start_url = 'http://bj.ganji.com/wu/'
url_host = 'http://bj.ganji.com'
def get_channel(url):
    wb_data = requests.get(url)
    soup = BeautifulSoup(wb_data.text, 'lxml')
    links = soup.select('.fenlei > dt > a')
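A minimal sketch of how get_channel might finish, joining each relative href onto url_host:

    # Join relative hrefs onto the host and return full channel URLs
    return [url_host + link.get('href') for link in links]

channel_list = get_channel(start_url)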
@wangkangdegithub
wangkangdegithub / IT_ITcharts.py
Created November 19, 2016 09:35
A crawler for the ITjuzi site: scrape the funding status of every company in Henan and chart the results
import pymongo
import charts
client = pymongo.MongoClient('localhost', 27017)
IT_juzi = client['IT_juzi']
info_list = IT_juzi['info_list']
#### Bar chart: distribution of funding status for companies in Henan ####
for i in info_list.find():
    print(i['发展阶段'])    # '发展阶段' = funding/development stage
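The chart itself is not in the preview; a minimal sketch under the assumption that the imported charts package exposes a Highcharts-style plot(series, options=..., show=...) call:

from collections import Counter
# Tally how many companies sit in each funding stage
stage_counts = Counter(i['发展阶段'] for i in info_list.find())
series = [{
    'name': '获投状态',
    'data': list(stage_counts.values()),
    'type': 'column'            # column = bar chart
}]
options = {
    'title': {'text': '河南地区公司获投状态分布'},
    'xAxis': {'categories': list(stage_counts.keys())}
}
charts.plot(series, options=options, show='inline')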
@wangkangdegithub
wangkangdegithub / lagou_detal_pages.py
Last active November 19, 2016 09:27
Crawl the data-mining category on Lagou, extract location, salary, and other fields for data-mining positions, and chart them with Highcharts
# Crawl the ~30 listing pages, the links on each page, and each link's content; store everything in the database:
# 2016-11-17: with no proxy-IP pool and no headers/cookies set, Lagou banned my IP mid-crawl... lesson learned!
from bs4 import BeautifulSoup
import requests, pymongo, re
import random
import time
host_url = 'https:'
start_url = 'https://www.lagou.com/zhaopin/shujuwajue/?filterOption=3'
client = pymongo.MongoClient('localhost', 27017)
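Given the IP-ban note above, a minimal sketch of the precautions it calls for; the header value and the database/collection names are assumptions:

lagou = client['lagou']['positions']        # hypothetical database/collection names
headers = {'User-Agent': 'Mozilla/5.0'}     # placeholder UA; rotate real ones in practice
def get_soup(url):
    time.sleep(random.uniform(1, 3))        # random delay between requests
    wb_data = requests.get(url, headers=headers)
    return BeautifulSoup(wb_data.text, 'lxml')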