Skip to content

Instantly share code, notes, and snippets.

@b1tninja
Created April 30, 2019 18:35
Show Gist options
  • Save b1tninja/8bbf6cf1d543d3235842458cf1162c8c to your computer and use it in GitHub Desktop.
Save b1tninja/8bbf6cf1d543d3235842458cf1162c8c to your computer and use it in GitHub Desktop.
Selenium myspace scraper
from selenium import webdriver
from selenium.common.exceptions import WebDriverException
from pprint import pprint
from urllib.parse import urljoin
# Base URL that profile slugs and relative page paths are appended to.
myspace_url = 'https://www.myspace.com/'
# Size-specific file name swapped onto the end of profile image URLs.
img_suffix = '600x600.jpg'

# Registry of every MyspaceProfile built so far, keyed by its normalised href.
profiles = {}
# Not referenced anywhere in the visible part of this file.
visited = set()
class MyspaceProfile:
    """A Myspace profile discovered by the scraper.

    Every instance registers itself in the module-level ``profiles`` dict,
    keyed by its normalised ``profile_href`` (the last path segment of the
    href it was built from).

    Attributes:
        profile_id: Profile id coerced with ``int()``; also used as the hash.
        profile_href: Last path segment of the href passed to the constructor.
        profile_name: Display name, stored as given.
        profile_img: Image URL whose trailing ``len(img_suffix)`` characters
            were replaced by ``img_suffix``, or ``None`` if none was given.
        top8: List of top-friend profiles (empty list by default).
        connections_out: Outgoing connections (empty list by default).
        connections_in: Incoming connections (empty list by default).
    """

    def __init__(self, profile_id, profile_href, profile_name, profile_img, top8=None, connections_out=None,
                 connections_in=None):
        """Normalise the raw scraped values and register the profile.

        Args:
            profile_id: Anything ``int()`` accepts.
            profile_href: Profile URL or bare slug; trailing slashes are
                ignored and only the last path segment is kept.
            profile_name: Display name.
            profile_img: Image URL ending in a size-specific file name, or
                ``None`` when the page exposed no image.
            top8: Optional list of top-friend profiles.
            connections_out: Optional list of outgoing connections.
            connections_in: Optional list of incoming connections.

        Raises:
            AssertionError: If a profile with the same normalised href is
                already registered.
        """
        # Normalise before the duplicate check: the registry is keyed by the
        # slug, so testing the raw URL could never find an existing entry.
        profile_href = profile_href.rstrip('/').rpartition('/')[2]
        assert profile_href not in profiles, 'profile %r is already registered' % (profile_href,)

        self.profile_id = int(profile_id)
        self.profile_href = profile_href
        self.profile_name = profile_name
        if profile_img is not None:
            # Swap the trailing size-specific file name for the preferred one.
            profile_img = profile_img[:-len(img_suffix)] + img_suffix
        self.profile_img = profile_img
        # Fresh lists per instance; never share a mutable default.
        self.top8 = [] if top8 is None else top8
        # TODO: populate from get_connections_out / get_connections_in once
        # those helpers are implemented.
        self.connections_out = [] if connections_out is None else connections_out
        self.connections_in = [] if connections_in is None else connections_in

        profiles[profile_href] = self

    def __eq__(self, other):
        """Profiles are equal when their numeric ids match (pairs with __hash__)."""
        if not isinstance(other, MyspaceProfile):
            return NotImplemented
        return self.profile_id == other.profile_id

    def __hash__(self):
        """Hash by the numeric profile id."""
        return self.profile_id

    def __repr__(self):
        """Return the repr of the instance attribute dict."""
        return repr(self.__dict__)
def get_connections_out(username):
    """Placeholder for fetching outgoing connections; not implemented.

    The argument is ignored and the function always returns ``None``.
    """
    return None
def get_connections_in(username):
    """Placeholder for fetching incoming connections; not implemented.

    The argument is ignored and the function always returns ``None``.
    """
    return None
def get_photos(driver):
    """Yield one attribute dict per photo tile on the page loaded in *driver*.

    Args:
        driver: A Selenium WebDriver (anything exposing
            ``find_elements(by, value)`` whose results offer
            ``get_attribute(name)``).

    Yields:
        dict: Maps each ``data-*`` attribute name listed below to the value
        returned by ``get_attribute`` for that tile (``None`` when the
        element does not report the attribute).
    """
    # Built once rather than on every loop iteration.
    photo_fields = (
        "data-id", "data-playlist-id", "data-entity-key", "data-is-connected", "data-is-owner",
        "data-mix-entity-key", "data-image-url", "data-title", "data-owner", "data-owner-entity-key",
        "data-is-liked",
    )
    # find_elements(by, value) exists in Selenium 3 and 4, whereas the
    # find_elements_by_* helpers were removed in Selenium 4.3.
    # 'xpath' is the value of selenium's By.XPATH.
    tiles = driver.find_elements(
        'xpath', '//ul[@id="photosContainer"]/li//div[@data-type="photo"]')
    for tile in tiles:
        yield {field: tile.get_attribute(field) for field in photo_fields}
def _get_or_create_profile(profile_id, profile_href, profile_name, profile_img, top8=None):
    """Return the registered profile for *profile_href*, building it on first sight."""
    # The registry is keyed by the last path segment of the href.
    slug = profile_href.rstrip('/').rpartition('/')[2]
    known = profiles.get(slug)
    if known is None:
        return MyspaceProfile(profile_id=profile_id, profile_href=profile_href,
                              profile_name=profile_name, profile_img=profile_img, top8=top8)
    if top8 is not None:
        known.top8 = top8
    return known


def _print_photos(driver):
    """Pretty-print every photo record found on the page loaded in *driver*."""
    for photo in get_photos(driver):
        pprint(photo)


def walk_profile(username, maxDepth=None):
    """Scrape one Myspace profile with headless Chrome and return it.

    If a profile whose href matches *username* (case-insensitively) is already
    in the ``profiles`` registry it is returned immediately, even if it was
    only registered as someone's top-8 friend. Otherwise the profile page,
    its photos page and each of its mixes are loaded; photo records are
    pretty-printed as they are found.

    Args:
        username: Profile slug appended to ``myspace_url``.
        maxDepth: Accepted but not used by the current implementation.

    Returns:
        The ``MyspaceProfile`` for *username*, or ``None`` when a
        ``WebDriverException`` occurred before the profile could be built.
        WebDriver errors are printed, not raised.
    """
    # Iterate the registered profile objects (not the dict keys) and compare
    # with str.lower(); str has no tolower() method.
    wanted = username.lower()
    for known in profiles.values():
        if known.profile_href.lower() == wanted:
            return known

    options = webdriver.ChromeOptions()
    options.add_argument('headless')
    # NOTE(review): presumably turns off image loading and caps the disk
    # cache to keep page loads light; confirm against Chrome's preferences.
    options.add_experimental_option("prefs", {
        "profile.managed_default_content_settings.images": 2,
        'disk-cache-size': 4096,
    })
    driver = webdriver.Chrome(options=options)

    # Bound up front so the final return is valid even if scraping fails early.
    profile = None
    # find_element(s)(by, value) exists in Selenium 3 and 4, whereas the
    # find_element(s)_by_* helpers were removed in Selenium 4.3.
    # 'xpath' is the value of selenium's By.XPATH.
    by_xpath = 'xpath'
    try:
        # Inside the try so a failed first load still shuts the browser down.
        driver.get(myspace_url + username)

        top8 = [
            _get_or_create_profile(
                profile_href=friend.get_attribute('href'),
                profile_id=friend.get_attribute('data-profileid'),
                profile_name=friend.get_attribute('data-title'),
                profile_img=friend.get_attribute('data-image-url'))
            for friend in driver.find_elements(by_xpath, '//div[@id="topFriends"]//ul//li/a')
        ]

        sidebar = driver.find_element(by_xpath, '//section[@id="sidebar"]')
        # NOTE(review): these expressions start with '//', which XPath
        # resolves from the document root rather than from `sidebar`; if the
        # intent is "inside the sidebar" they should start with './/'.
        # Left unchanged because the page markup cannot be checked from here.
        profile_a = sidebar.find_element(by_xpath, '//a[@data-click-object-type="ProfileContextualNav"]')
        profile = _get_or_create_profile(
            profile_id=sidebar.find_element(by_xpath, '//div[@data-id]').get_attribute('data-id'),
            profile_img=sidebar.find_element(
                by_xpath, '//div[@data-tour-id="profileimageedit"]//img').get_attribute('src'),
            profile_href=profile_a.get_attribute('href'),
            profile_name=profile_a.text,
            top8=top8)

        # Get photos
        driver.get(myspace_url + username + '/photos')
        _print_photos(driver)

        # Get mixes. Collect every URL before navigating away, because the
        # element handles belong to the currently loaded page.
        driver.get(myspace_url + username + '/mixes')
        mix_buttons = driver.find_elements(
            by_xpath, '//div[contains(@class, "mixtapes")]//ul[@id="mixGrid"]/li//button[@data-type="mixtape"]')
        mix_urls = [button.get_attribute('data-url') for button in mix_buttons]
        for mix_url in mix_urls:
            if not mix_url:
                # Without a URL, urljoin would just return the site root.
                continue
            driver.get(urljoin(myspace_url, mix_url))
            # NOTE(review): scrolls horizontally by the body's scroll width,
            # presumably to make the page load more items; confirm.
            driver.execute_script('window.scrollBy(document.body.scrollWidth,0)')
            _print_photos(driver)

        # TODO: scrape username + '/connections/out' and '/connections/in'
        # (the ul#profileGrid listing) to fill connections_out / connections_in.
    except WebDriverException as e:
        print(e)
    finally:
        # quit() ends the whole WebDriver session; close() only closes the
        # current window.
        driver.quit()
    return profile
# NOTE(review): the commented-out fragment below reads like JavaScript rather
# than Python and nothing in this file refers to it; presumably a leftover
# scratch note. Confirm before removing.
#
# comm.send("screen.trackers", {
# scrollType: "horizontal",
# pageNum: ++k
# });
# Script entry point: scrape the profile named "username" and pretty-print
# whatever walk_profile returns.
# NOTE(review): width=-1 is presumably meant to make pprint wrap its output as
# aggressively as possible; confirm.
pprint(walk_profile("username"), width=-1)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment