Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Save mauromarano/d424b2d76722616f5547fd77a2b891d7 to your computer and use it in GitHub Desktop.
Save mauromarano/d424b2d76722616f5547fd77a2b891d7 to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 71,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# importing dependencies\n",
"import requests\n",
"from bs4 import BeautifulSoup\n",
"\n",
"# results URL; the page number is appended right after '?o='\n",
"base_url = 'http://www.subito.it/annunci-italia/vendita/appartamenti/?o='\n",
"# how many result pages to download\n",
"pagine_da_cercare = 100\n",
"\n",
"\n",
"def get_source(url, page=1, timeout=30):\n",
"    \"\"\"Download one results page and return its raw body.\n",
"\n",
"    url     -- base URL that ends with the page query parameter\n",
"    page    -- page number, appended to url as text\n",
"    timeout -- seconds to wait for the server; when it elapses requests\n",
"               raises requests.exceptions.Timeout. Without a timeout a\n",
"               stalled connection would block the caller forever.\n",
"\n",
"    Returns the body of the response (r.content).\n",
"    \"\"\"\n",
"    r = requests.get(url + str(page), timeout=timeout)\n",
"    return r.content\n",
"\n",
"\n",
"#html_sources = get_source(base_url,1)"
]
},
{
"cell_type": "code",
"execution_count": 72,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"def get_data_from_page(source):\n",
"    \"\"\"Extract price, surface and city code from one results page.\n",
"\n",
"    source -- HTML of a results page\n",
"\n",
"    Returns a list of dicts with the keys 'price', 'mq' and 'city'. All\n",
"    values are the text scraped from the page (still strings). A listing\n",
"    that lacks one of the three fields is reported on stdout and skipped.\n",
"    \"\"\"\n",
"    apartments = []\n",
"    soup = BeautifulSoup(source, \"html.parser\")\n",
"    for item in soup.select('.item_description'):\n",
"        try:\n",
"            # price: the text before the first space\n",
"            price = item.select('.item_price')[0].string\n",
"            price = price.split(\" \")[0].strip()\n",
"            # mq: second '-'-separated field of the text preceding 'mq'\n",
"            specs = item.select('.item_specs')[0].string\n",
"            mq = specs.split(\"mq\")[0].split(\"-\")[1].strip()\n",
"            # city: whatever is written between the parentheses\n",
"            city = item.select('.item_city')[0].string\n",
"            city = city.split(\"(\")[1].split(')')[0].strip()\n",
"        except (IndexError, AttributeError):\n",
"            # IndexError: tag not found, or text not in the expected shape.\n",
"            # AttributeError: .string is None when the tag holds nested markup.\n",
"            # Anything else is a real bug and is allowed to propagate.\n",
"            print(\"Some values is missing\")\n",
"            continue\n",
"        apartments.append({\"price\": price, \"mq\": mq, \"city\": city})\n",
"\n",
"    return apartments\n",
" \n",
" \n"
]
},
{
"cell_type": "code",
"execution_count": 73,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Some values is missing\n",
"Some values is missing\n",
"Some values is missing\n",
"Some values is missing\n",
"Some values is missing\n",
"Some values is missing\n",
"Some values is missing\n",
"Some values is missing\n",
"Some values is missing\n",
"Some values is missing\n",
"Some values is missing\n",
"Some values is missing\n",
"Some values is missing\n",
"Some values is missing\n",
"Some values is missing\n",
"Some values is missing\n",
"Some values is missing\n",
"Some values is missing\n",
"Some values is missing\n",
"Some values is missing\n",
"Some values is missing\n",
"Some values is missing\n",
"Some values is missing\n",
"Some values is missing\n",
"Some values is missing\n",
"Some values is missing\n",
"Some values is missing\n",
"Some values is missing\n",
"Some values is missing\n",
"Some values is missing\n",
"Some values is missing\n",
"Some values is missing\n",
"Some values is missing\n",
"Some values is missing\n",
"Some values is missing\n",
"Some values is missing\n",
"Some values is missing\n",
"Some values is missing\n",
"Some values is missing\n",
"Some values is missing\n",
"Some values is missing\n",
"Some values is missing\n",
"Some values is missing\n",
"Some values is missing\n",
"Some values is missing\n",
"Some values is missing\n",
"Some values is missing\n",
"Some values is missing\n",
"Some values is missing\n",
"Some values is missing\n",
"Some values is missing\n",
"Some values is missing\n",
"Some values is missing\n",
"Some values is missing\n",
"Some values is missing\n",
"Some values is missing\n",
"Some values is missing\n",
"Some values is missing\n",
"Some values is missing\n",
"Some values is missing\n",
"Some values is missing\n",
"Some values is missing\n",
"Some values is missing\n",
"Some values is missing\n",
"Some values is missing\n",
"Some values is missing\n",
"Some values is missing\n",
"Some values is missing\n",
"Some values is missing\n",
"Some values is missing\n",
"Some values is missing\n",
"Some values is missing\n",
"Some values is missing\n",
"Some values is missing\n",
"Some values is missing\n",
"Some values is missing\n",
"Some values is missing\n",
"Some values is missing\n",
"Some values is missing\n",
"Some values is missing\n",
"Some values is missing\n",
"Some values is missing\n",
"Some values is missing\n",
"Some values is missing\n",
"Some values is missing\n",
"Some values is missing\n",
"Some values is missing\n",
"Some values is missing\n",
"Some values is missing\n",
"Some values is missing\n",
"Some values is missing\n",
"Some values is missing\n",
"Some values is missing\n",
"Some values is missing\n",
"Some values is missing\n",
"Some values is missing\n",
"Some values is missing\n",
"Some values is missing\n",
"Some values is missing\n",
"Some values is missing\n",
"Some values is missing\n",
"Some values is missing\n",
"Some values is missing\n",
"Some values is missing\n",
"Some values is missing\n",
"Some values is missing\n",
"Some values is missing\n",
"Some values is missing\n",
"Some values is missing\n",
"Some values is missing\n",
"Some values is missing\n",
"Some values is missing\n",
"Some values is missing\n",
"Some values is missing\n",
"Some values is missing\n",
"Some values is missing\n",
"Some values is missing\n",
"Some values is missing\n",
"Some values is missing\n",
"Some values is missing\n",
"Some values is missing\n",
"Some values is missing\n",
"Some values is missing\n",
"Some values is missing\n",
"Some values is missing\n",
"Some values is missing\n",
"Some values is missing\n",
"Some values is missing\n",
"Some values is missing\n",
"Some values is missing\n",
"Some values is missing\n",
"Some values is missing\n",
"Some values is missing\n",
"Some values is missing\n",
"Some values is missing\n",
"Some values is missing\n",
"Some values is missing\n",
"Some values is missing\n",
"Some values is missing\n",
"Some values is missing\n",
"Some values is missing\n",
"Some values is missing\n",
"Some values is missing\n",
"Some values is missing\n",
"Some values is missing\n",
"Some values is missing\n",
"Some values is missing\n",
"Some values is missing\n",
"Some values is missing\n",
"Some values is missing\n",
"Some values is missing\n",
"Some values is missing\n",
"Some values is missing\n",
"Some values is missing\n",
"Some values is missing\n",
"Some values is missing\n",
"Some values is missing\n",
"Some values is missing\n",
"Some values is missing\n",
"Some values is missing\n",
"Some values is missing\n",
"Some values is missing\n",
"Some values is missing\n",
"Some values is missing\n",
"Some values is missing\n",
"Some values is missing\n",
"Some values is missing\n",
"Some values is missing\n",
"Some values is missing\n",
"Some values is missing\n",
"Some values is missing\n",
"Some values is missing\n",
"Some values is missing\n",
"Some values is missing\n",
"Some values is missing\n",
"Some values is missing\n",
"Some values is missing\n"
]
}
],
"source": [
"# one element per downloaded page: the list of apartment dicts found on it\n",
"# NOTE(review): pages 0 .. pagine_da_cercare-1 are requested, while\n",
"# get_source defaults to page=1 - confirm how the site numbers its pages\n",
"apartaments = [\n",
"    get_data_from_page(get_source(base_url, page_number))\n",
"    for page_number in range(pagine_da_cercare)\n",
"]\n",
" \n"
]
},
{
"cell_type": "code",
"execution_count": 74,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"\n",
"# final data: flatten the per-page lists into rows of [city, mq, price]\n",
"data = [\n",
"    [flat['city'], flat['mq'], flat['price']]\n",
"    for page_items in apartaments\n",
"    for flat in page_items\n",
"]\n",
" "
]
},
{
"cell_type": "code",
"execution_count": 75,
"metadata": {
"collapsed": false,
"scrolled": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Ci sono un totale di 2823 appartamenti \n"
]
}
],
"source": [
"from sklearn import preprocessing\n",
"\n",
"# features: [city, mq] for every row of data\n",
"x = [[row[0], row[1]] for row in data]\n",
"\n",
"# target: the price of the same row\n",
"y = [row[2] for row in data]\n",
"\n",
"# city labels, in the same order as x\n",
"x_citta = [features[0] for features in x]\n",
"\n",
"# map every distinct city label to an integer code\n",
"le = preprocessing.LabelEncoder()\n",
"le.fit(x_citta)\n",
"citta_transformate = le.transform(x_citta)\n",
"#le.inverse_transform([28,46])\n",
"#le.transform(['IM'])\n",
"\n",
"# overwrite the city label of each feature row with its integer code\n",
"for features, code in zip(x, citta_transformate):\n",
"    features[0] = code\n",
"\n",
"print(\"Ci sono un totale di \" + str(len(data)) + \" appartamenti \")"
]
},
{
"cell_type": "code",
"execution_count": 103,
"metadata": {
"collapsed": false,
"scrolled": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[14]\n",
"[u'178.000']\n"
]
}
],
"source": [
"from sklearn import tree\n",
"\n",
"# NOTE(review): y holds the prices as scraped strings, so every distinct\n",
"# price is treated as a separate class - confirm whether a regressor on\n",
"# numeric prices was intended\n",
"clf = tree.DecisionTreeClassifier()\n",
"clf = clf.fit(x, y)\n",
"\n",
"# le.transform returns an array with one code per input label\n",
"citta = le.transform(['BO'])\n",
"print(citta)\n",
"\n",
"# a sample is a flat row [city_code, mq]: pass the scalar code, not the\n",
"# one-element array, otherwise the row is ragged\n",
"prediction = clf.predict([[citta[0], \"145\"]])\n",
"print(prediction)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 2",
"language": "python",
"name": "python2"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.10"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment