Last active
April 17, 2021 10:39
-
-
Save aybukemeydan/76648a075fda70e76e163a17b563467b to your computer and use it in GitHub Desktop.
Collecting data from a real estate site.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pandas as pd | |
import numpy as np | |
import requests | |
from bs4 import BeautifulSoup as bs | |
import re | |
# Request headers handed to getAndParse for every page fetch.
headers = {"User-Agent": "Mozilla/5.0"}
def getAndParse(headers, page, isitma):
    """Request one rental-listing results page and parse the returned HTML.

    Args:
        headers: HTTP request headers to send (for example a User-Agent).
        page: Results page number, placed in the ``page`` query parameter.
        isitma: Heating-type filter code, placed in the ``p10`` query parameter.

    Returns:
        A BeautifulSoup tree built from the response body.
    """
    url = f"https://www.hurriyetemlak.com/sariyer-kiralik/daire?p10={isitma}&page={page}"
    # The second positional argument of requests.get is ``params``; the headers
    # must be passed by keyword or they end up in the query string instead.
    response = requests.get(url, headers=headers)
    # Name the parser explicitly so the parse result does not depend on which
    # optional parser libraries happen to be installed.
    return bs(response.text, "html.parser")
def gethouseItem(soup):
    """Return the listing containers found on a parsed results page.

    Most per-listing fields are extracted from these elements afterwards.

    Args:
        soup: Parsed page; any object exposing a BeautifulSoup-style
            ``find_all`` method.

    Returns:
        A list of the ``div`` elements whose class matches ``listing-item``.
    """
    return list(soup.find_all("div", class_=re.compile("listing-item")))
def getPrice(soup):
    """Return the price elements found on a parsed results page.

    Args:
        soup: Parsed page; any object exposing a BeautifulSoup-style
            ``find_all`` method.

    Returns:
        A list of the elements whose class matches ``list-view-price``.
    """
    return list(soup.find_all(class_=re.compile("list-view-price")))
def getLoc(soup):
    """Return the location elements found on a parsed results page.

    Args:
        soup: Parsed page; any object exposing a BeautifulSoup-style
            ``find_all`` method.

    Returns:
        A list of the elements carrying the ``list-view-location`` class.
    """
    return list(soup.find_all(class_="list-view-location"))
def getInfo(soup):
    """Return the listing-header elements found on a parsed results page.

    Args:
        soup: Parsed page; any object exposing a BeautifulSoup-style
            ``find_all`` method.

    Returns:
        A list of the elements whose class matches ``list-view-header``.
    """
    return list(soup.find_all(class_=re.compile("list-view-header")))
# Accumulator lists that the scraping loop fills, one entry per listing.
# No currency list is kept: all of the prices are in TL.
celly_houseRoomCount = []  # number of rooms
celly_buildingAge = []  # building age
celly_floortype = []  # floor information
m2 = []  # area text (m2)
location = []  # neighbourhood
days = []  # date the listing was posted
pr = []  # price
info = []  # listing header text
isitma = []  # heating-type code of the building
# Walk through the result pages for every heating-type filter and collect the
# fields of each listing. The site identifies heating types by these codes.
for isi in (101301, 101302, 101303, 101304, 101305,
            101306, 101307, 101308, 101309, 101310):
    # NOTE(review): the original comment spoke of 28 pages, but this range
    # covers pages 1 through 27 - confirm which one is intended.
    for page in range(1, 28):
        soup = getAndParse(headers, page, isi)
        houseItem = gethouseItem(soup)
        # These page-level lists are the same for every listing on the page,
        # so build them once per page rather than once per listing.
        loc = getLoc(soup)
        inf = getInfo(soup)
        price = getPrice(soup)
        for i, item in enumerate(houseItem):
            info.append(inf[i].text)
            pr.append(price[i].text)
            isitma.append(isi)
            days.append(item.find(class_="list-view-date").text)
            celly_houseRoomCount.append(item.find(class_="celly houseRoomCount").text)
            celly_buildingAge.append(item.find_all(class_="celly buildingAge")[0].text)
            m2.append(item.find(class_=re.compile("celly squareMeter list-view")).text)
            k = loc[i].find_all("span")
            location.append(k[0].text + k[1].text)
            # Some listings do not show floor information; store a placeholder
            # so that every list keeps one entry per listing.
            floor = item.find(class_="celly floortype")
            celly_floortype.append("boş" if floor is None else floor.text)
# Move the scraped lists into a DataFrame, one column per field.
df = pd.DataFrame({
    "m2": m2,
    "celly_houseRoomCount": celly_houseRoomCount,
    "description": info,
    "isitma": isitma,
    "celly_buildingAge": celly_buildingAge,
    "celly_floortype": celly_floortype,
    "location": location,
    "days": days,
    "price": pr,
})

# Clean the data: trim the newlines surrounding the scraped text.
cols = ["m2", "price"]
for col in cols:
    df[col] = df[col].str.strip("\n")

# Remove only the trailing "m2" unit. str.strip(" m2") treats its argument as a
# set of characters and would also eat digits (e.g. "122 m2" -> "1").
df["m2"] = df["m2"].str.replace(r"\s*m2\s*$", "", regex=True).str.strip()

# Drop duplicated rows.
df.drop_duplicates(inplace=True)
# Labels for the heating-type codes that have an entry of their own.
_ISITMA_LABELS = {
    101302: "Kat Kaloriferi",
    101303: "Klima",
    101304: "Kombi",
    101305: "Merkezi",
    101306: "Soba",
    101308: "Isıtma Yok",
    101309: "Belirtilmemiş",
}


def fix_isitma(isi):
    """Translate a numeric heating-type code into its label.

    Args:
        isi: Heating-type code.

    Returns:
        The label for the code. Any value without an entry of its own yields
        "Merkezi (Pay Ölçer)".
    """
    # NOTE(review): the scraping loop also requests codes 101301, 101307 and
    # 101310, which all receive the fallback label - confirm this is intended.
    return _ISITMA_LABELS.get(isi, "Merkezi (Pay Ölçer)")
# Turn the numeric heating codes taken from the site into readable labels.
df["isitma"] = df["isitma"].map(fix_isitma)

# Save the result.
df.to_csv("emlak.csv")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment