Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Save jpgreth/1af6d2399e8cb750814100579462cd3b to your computer and use it in GitHub Desktop.
Save jpgreth/1af6d2399e8cb750814100579462cd3b to your computer and use it in GitHub Desktop.
Scrape Facebook Groups for Apartments using Python
import os
import re
import sqlite3
import argparse
import itertools
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
from facebook_scraper import get_posts
# sqlite setup
conn = sqlite3.connect('posts.db')
conn.row_factory = sqlite3.Row
c = conn.cursor()
# Create table
c.execute("CREATE TABLE IF NOT EXISTS posts (post_text text, price real, number_of_rooms INTEGER, image_url text, post_id text PRIMARY KEY)")
class FacebookPost(object):
def __init__(self, post_text: str, price: int, image_url: str, post_id: str, number_of_rooms: int):
self.post_text = post_text
self.price = price
self.image_url = image_url
self.post_id = post_id
self.number_of_rooms = number_of_rooms
def save_post(post: FacebookPost):
c.execute("REPLACE INTO posts VALUES (?,?,?,?,?)", (
post.post_text,
post.price,
post.number_of_rooms,
post.image_url,
post.post_id
))
def get_posts_from_db():
posts = [FacebookPost(p[0], p[1], p[3], p[4], p[2]) for p in c.execute("SELECT * from posts")]
return posts
# argument parser setup
parser = argparse.ArgumentParser(description='Scrape Facebook for posts.')
parser.add_argument('--scrape', action='store_true')
parser.add_argument('--display', action='store_true')
parser.add_argument('--plot', action='store_true')
parser.add_argument('--minimum_price', action='store', type=int, default=5000)
parser.add_argument('--maximum_price', action='store', type=int, default=6000)
parser.add_argument('--minimum_rooms', action='store', type=int, default=3)
parser.add_argument('--maximum_rooms', action='store', type=int, default=3)
args = parser.parse_args()
def pretty_print(post: FacebookPost):
print("\n-----------Post----------\n")
print(f"\n PRICE: {post.price}\n")
if number_of_rooms(post):
print(f"\n number_of_rooms: {number_of_rooms(post)}\n")
print(post.post_text)
def price(post: FacebookPost):
searchResults = re.search('[^0-9]([0-9],{0,1}[0-9]{3})[^0-9]', post.post_text)
if searchResults:
return searchResults.group(1).replace(',', '')
return None
def append_price(post: FacebookPost):
hasPrice = price(post)
if hasPrice:
post.price = hasPrice
def within_price_budget(post: FacebookPost):
if post.price:
return int(post.price) >= flags['minimum_price'] and int(post.price) <= flags['maximum_price']
def right_size(post: FacebookPost):
if post.number_of_rooms:
return int(post.number_of_rooms) >= flags['minimum_rooms'] and int(post.number_of_rooms) <= flags['maximum_rooms']
def has_text(post: FacebookPost):
return len(post.post_text) > 1
def number_of_rooms(post: FacebookPost):
match = re.search('([0-9]) חדרים', post.post_text)
if match:
return match.group(1)
return None
def append_rooms(post: FacebookPost):
hasRooms = number_of_rooms(post)
if hasRooms:
post.number_of_rooms = hasRooms
def print_if_passes_filters(post: FacebookPost):
if passes_filters(post):
pretty_print(post)
def display():
for p in get_posts_from_db():
append_details_from_text(p)
print_if_passes_filters(p)
def append_details_from_text(post: FacebookPost):
append_price(post)
append_rooms(post)
def passes_filters(post: FacebookPost):
return all((
within_price_budget(post),
right_size(post),
has_text(post)
))
# parse args to flags
flags = vars(args)
if flags['display'] and not flags['scrape']:
display()
pages = 10
credentials = None # (os.getenv('fb_username'), os.getenv('fb_password'))
allGroups = itertools.chain(
get_posts(group=253957624766723, pages=pages, credentials=credentials), # דירות להשכרה ברמת גן
get_posts(group=115046608513246, pages=pages, credentials=credentials), # דירות מפה לאוזן בגבעתיים
get_posts(group=246902125410185, pages=pages, credentials=credentials), # דירות להשכרה ברמת גן / גבעתיים
)
if flags['plot']:
sns.set(style="whitegrid")
data = [(post.price, post.number_of_rooms) for post in get_posts_from_db() if (post.price and post.number_of_rooms)]
df = pd.DataFrame(data, columns=["price", "rooms"])
sns.jointplot(x="price", y="rooms", data=df, kind="kde");
plt.show()
if flags['scrape']:
for post in allGroups:
currentFacebookPost = FacebookPost(post.get('text'), None, post.get('image'), post.get('post_id'), None)
append_details_from_text(currentFacebookPost)
save_post(currentFacebookPost)
if flags['display']:
print_if_passes_filters(currentFacebookPost)
# Save (commit) the changes
conn.commit()
c.close()
facebook-scraper >= 0.2.3
seaborn
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment