laing20333 laing20333

## mp_crawler.py
# coding=utf-8
# Goal: parse house information for each district from websites.
# For each district, write 「土地區段位置或建物區門牌」,「建物型態」,「建物現況格局」,「坪數」,「屋齡」,「總價元」,「資料來源」 to a CSV file
# (land section location or building address, building type, current building layout, area in ping, building age, total price, data source).

# Procedure:
# 1. Get the number of pages for each district by parsing the first HTML page.
# 2. For each district, put all HTML pages together, use htmlparser to parse the content, and save the data to a file.

import sys
import math

## profiling.py
#coding=utf-8
import distance_fun
import timeit
import numpy as np
import matplotlib.pyplot as plt

# Benchmark the pure-Python kp_distance: the statement below is executed
# 1,000,000 times on two fixed five-attribute sample objects with Wc = 0.5,
# and the total elapsed time in seconds is printed.
# NOTE(review): the timed statement and its setup import `clustering`, while
# this file imports `distance_fun` -- confirm which module is meant to be timed.
# NOTE(review): np.array() on a list mixing ints and strings presumably coerces
# every element to a string type, so all five attributes may be treated as
# categorical -- confirm this is the intended benchmark input.
t_python = timeit.Timer("clustering.kp_distance(np.array([1, 2, 4, '住', '公寓']), np.array([2, 2, 2, '商', '公寓']), 0.5)" , "import clustering\nimport numpy as np")
time_python = t_python.timeit(1000000)
print 'python code execution time: ' + str(time_python) + 's'

## distance_fun.py
def kp_distance(object_x, object_y, Wc):
    ''' Distance function for two objects.

        Walks the attributes of object_x by index and compares each one with
        the attribute at the same index in object_y.

        object_x, object_y: indexable attribute sequences; object_y is read
            with object_x's indices, so it must be at least as long.
        Wc: amount added to the running result for each categorical
            attribute (type str or np.string_) whose two values are equal.

        NOTE(review): only the categorical branch is visible in this
        excerpt; handling of other attribute types and the return value are
        not shown here -- confirm against the full file.
        '''
    res = 0.0
    for attr_idx in xrange(0, len(object_x)):
        # The attribute's type is taken from object_x only; object_y's element
        # type is not inspected.
        cur_type = type(object_x[attr_idx])
        if( (cur_type == str) or (cur_type == np.string_) ):
            # categorical attribute
            # NOTE(review): Wc is added when the values MATCH; for a distance
            # one would expect the penalty on a mismatch -- confirm intent.
            if (object_x[attr_idx] == object_y[attr_idx]):
                res = res + Wc

## distance_fun.pyx
cimport numpy as np
cimport cython
import numpy as np
# Cython variant of kp_distance: the arguments are typed as NumPy arrays and a
# C float, and the accumulator and loop index are declared as C variables.
# NOTE(review): only the start of the loop body is visible in this excerpt.
def kp_distance(np.ndarray object_x, np.ndarray object_y, float Wc):
    ''' Distance function for two objects
        '''
    # Running result, held in a C float.
    cdef float res = 0.0
    # Loop index declared as a C unsigned integer.
    cdef unsigned attr_idx
    for attr_idx in xrange(0, len(object_x)):
        # Type of the current attribute, taken from object_x only.
        cur_type = type(object_x[attr_idx])

## sp_crawler.py
# coding=utf-8
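# Goal:
# Parse the websites of different cities.
# For each city, write 「土地區段位置或建物區門牌」,「建物型態」,「建物現況格局」,「坪數」,「屋齡」,「總價元」,「資料來源」 to a CSV file
# (land section location or building address, building type, current building layout, area in ping, building age, total price, data source).

# Steps:
# 1. Get the number of pages by parsing a string.
# 2. Put all HTML pages together and use htmlparser to get the content.
# 3. Parse the content and save it.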

## Cure.cpp
#include "Cure.h"
#include<iostream>
#include<cmath>
#include<algorithm>
#include<memory.h>

using namespace std;

// Less-than for CLUSTER_PAIR: a compares less than b when a.distance is
// smaller than b.distance.
bool operator< (const struct CLUSTER_PAIR &a, const struct CLUSTER_PAIR &b){
	return (a.distance < b.distance);
	# coding=utf-8
	# Goal: parse house information for each district from websites
	# for each district, get 「土地區段位置或建物區門牌」,「建物型態」,「建物現況格局」,「坪數」,「屋齡」,「總價元」,「資料來源」into csv file

	# Procedure:
	# 1. get the number of page for each district by parsing first html content
	# 2. for each district put all html page together, use htmlparser to parse content and save data into file

	import sys
	import math
	#coding=utf-8
	import distance_fun
	import timeit
	import numpy as np
	import matplotlib.pyplot as plt

	t_python = timeit.Timer("clustering.kp_distance(np.array([1, 2, 4, '住', '公寓']), np.array([2, 2, 2, '商', '公寓']), 0.5)" , "import clustering\nimport numpy as np")
	time_python = t_python.timeit(1000000)
	print 'python code execution time: ' + str(time_python) + 's'
	def kp_distance(object_x, object_y, Wc):
	''' Distance function for two objects
	'''
	res = 0.0
	for attr_idx in xrange(0, len(object_x)):
	cur_type = type(object_x[attr_idx])
	if( (cur_type == str) or (cur_type == np.string_) ):
	# categorical attribute
	if (object_x[attr_idx] == object_y[attr_idx]):
	res = res + Wc
	cimport numpy as np
	cimport cython
	import numpy as np
	def kp_distance(np.ndarray object_x, np.ndarray object_y, float Wc):
	''' Distance function for two objects
	'''
	cdef float res = 0.0
	cdef unsigned attr_idx
	for attr_idx in xrange(0, len(object_x)):
	cur_type = type(object_x[attr_idx])
	# coding=utf-8
	# Goal:
	# parse different city's website
	# for each city, get 「土地區段位置或建物區門牌」,「建物型態」,「建物現況格局」,「坪數」,「屋齡」,「總價元」,「資料來源」into csv file

	# step:
	# 1. get number of page by parsing string
	# 2. put all html page together, use htmlparser to get content
	# 3. parsing content and save it
	#include "Cure.h"
	#include<iostream>
	#include<cmath>
	#include<algorithm>
	#include<memory.h>

	using namespace std;

	bool operator< (const struct CLUSTER_PAIR &a, const struct CLUSTER_PAIR &b){
	return (a.distance < b.distance);