Skip to content

Instantly share code, notes, and snippets.

@laing20333
laing20333 / mp_crawler.py
Last active June 3, 2017 01:27
Multi-process crawler (Python 2.x)
# coding=utf-8
# Goal: parse house information for each district from the websites.
# For each district, write 「土地區段位置或建物區門牌」,「建物型態」,「建物現況格局」,「坪數」,「屋齡」,「總價元」,「資料來源」 to a CSV file.
# Procedure:
# 1. Get the number of pages for each district by parsing the first HTML page.
# 2. For each district, put all HTML pages together, use htmlparser to parse the content, and save the data to a file.
import sys
import math
#coding=utf-8
import distance_fun
import timeit
import numpy as np
import matplotlib.pyplot as plt
# Benchmark kp_distance: the timed statement builds two five-element NumPy
# arrays and calls clustering.kp_distance on them with 0.5 as the third
# argument, so the measured time includes array construction on every run.
# NOTE(review): the setup string imports `clustering`, while this file imports
# `distance_fun` -- confirm which module is meant to be timed.
t_python = timeit.Timer("clustering.kp_distance(np.array([1, 2, 4, '住', '公寓']), np.array([2, 2, 2, '商', '公寓']), 0.5)" , "import clustering\nimport numpy as np")
# Run the statement 1,000,000 times; timeit() returns the total elapsed
# time in seconds for all runs (not the per-call average).
time_python = t_python.timeit(1000000)
print 'python code execution time: ' + str(time_python) + 's'
@laing20333
laing20333 / distance_fun.py
Last active August 29, 2015 14:10
python example
def kp_distance(object_x, object_y, Wc):
''' Distance function for two objects
'''
res = 0.0
for attr_idx in xrange(0, len(object_x)):
cur_type = type(object_x[attr_idx])
if( (cur_type == str) or (cur_type == np.string_) ):
# categorical attribute
if (object_x[attr_idx] == object_y[attr_idx]):
res = res + Wc
@laing20333
laing20333 / distance_fun.pyx
Created December 1, 2014 11:36
cython example
cimport numpy as np
cimport cython
import numpy as np
def kp_distance(np.ndarray object_x, np.ndarray object_y, float Wc):
''' Distance function for two objects
'''
cdef float res = 0.0
cdef unsigned attr_idx
for attr_idx in xrange(0, len(object_x)):
cur_type = type(object_x[attr_idx])
@laing20333
laing20333 / sp_crawler.py
Last active August 4, 2016 19:28
Single-process crawler (Python 2.x)
# coding=utf-8
# Goal:
# parse each city's website
# for each city, write 「土地區段位置或建物區門牌」,「建物型態」,「建物現況格局」,「坪數」,「屋齡」,「總價元」,「資料來源」 to a CSV file
# Steps:
# 1. get the number of pages by parsing a string
# 2. put all HTML pages together and use htmlparser to get the content
# 3. parse the content and save it
#include "Cure.h"
#include<iostream>
#include<cmath>
#include<algorithm>
#include<memory.h>
using namespace std;
bool operator< (const struct CLUSTER_PAIR &a, const struct CLUSTER_PAIR &b){
return (a.distance < b.distance);