yogendratamang48/Free-Udemy-Courses-Crawler.md

## Free-Udemy-Courses-Crawler.md

      
    Raw
  

              Free-Udemy-Courses-Crawler.md
            
          
    Install Dependencies

pip3 install requests
pip3 install lxml

Run the script


python3 smartybro.py


## smartybro.py
#! /usr/bin/env python3
"""
Print free (100%-off) Udemy coupon links scraped from the SmartyBro listing page.
"""
import requests
from lxml import html
from datetime import datetime, timedelta
from multiprocessing.pool import ThreadPool


# Listing page that links to every "100% off" Udemy coupon post.
url = "https://smartybro.com/category/udemy-coupon-100-off/"
# Fetched once at import time. The timeout keeps an unresponsive server from
# hanging the script forever (requests applies no timeout by default).
resp = requests.get(url, timeout=30)
# Parsed listing tree that today_courses() walks.
page = html.fromstring(resp.text)
# 'prod' crawls the live listing; any other value uses fake_courses().
ENV = 'prod'

def today_courses():
    """
    Yield (title, link) for listing entries published today or yesterday.

    Walks every ``//h2/a`` anchor of the module-level ``page`` and reads the
    publication date from the ``YYYY/MM/DD`` segment that follows the site
    root in the post URL.  Anchors without text, without an href, or whose
    URL carries no such date are skipped.
    """
    # Computed once per run instead of once per heading.
    cutoff = (datetime.now() - timedelta(days=1)).date()
    for heading in page.xpath('//h2/a'):
        title = heading.xpath('text()')
        link = heading.xpath('@href')
        if not (title and link):
            continue
        raw_date = link[0].replace('https://smartybro.com/', '')[:10]
        try:
            clean_date = datetime.strptime(raw_date, '%Y/%m/%d')
        except ValueError:
            # Not a dated post URL (e.g. a category or external link);
            # one odd heading should not abort the whole crawl.
            continue
        if clean_date.date() >= cutoff:
            yield (title[0], link[0])

def parse_page(tuple_data):
    """
    Fetch one course post and return its first coupon link.

    args:
    tuple_data - (title, link) pair; only the link is used

    returns:
    the first href on the page that contains "couponCode", or None when the
    page has no such link
    """
    link = tuple_data[1]
    # Bounded wait so one stalled post cannot block a pool worker forever.
    resp = requests.get(link, timeout=30)
    page = html.fromstring(resp.content)
    links = page.xpath('//a[contains(@href, "couponCode")]/@href')
    if links:
        return links[0]
    return None

def fake_courses():
    """Return a fixed, one-entry list of (title, link) for offline testing.
    """
    sample_title = 'Test'
    sample_link = 'https://smartybro.com/2020/10/29/catia-v5-3d-design-modeling-course-for-beginner/'
    return [(sample_title, sample_link)]

if __name__ == '__main__':
    # Crawl the live listing in 'prod'; otherwise use the canned sample.
    if ENV == 'prod':
        fetch_function = today_courses
    else:
        fetch_function = fake_courses
    with ThreadPool(5) as TP:
        # Resolve every course page concurrently; map() keeps input order.
        found = TP.map(parse_page, fetch_function())
        for coupon_url in found:
            # Pages without a coupon link come back as None.
            if coupon_url is None:
                continue
            # Keep only the part before the first '&'.
            print(coupon_url.split('&')[0].strip())
	#! /usr/bin/env python3
	"""
	returns free coupons from SmartyBro page
	"""
	import requests
	from lxml import html
	from datetime import datetime, timedelta
	from multiprocessing.pool import ThreadPool



	# Listing page that links to every "100% off" Udemy coupon post.
	url = "https://smartybro.com/category/udemy-coupon-100-off/"
	# Fetched once at import time. The timeout keeps an unresponsive server from
	# hanging the script forever (requests applies no timeout by default).
	resp = requests.get(url, timeout=30)
	# Parsed listing tree that today_courses() walks.
	page = html.fromstring(resp.text)
	# 'prod' crawls the live listing; any other value uses fake_courses().
	ENV = 'prod'

	def today_courses():
	    """
	    Yield (title, link) for listing entries published today or yesterday.
	    Walks every ``//h2/a`` anchor of the module-level ``page`` and reads the
	    publication date from the ``YYYY/MM/DD`` segment that follows the site
	    root in the post URL.  Anchors without text, without an href, or whose
	    URL carries no such date are skipped.
	    """
	    # Computed once per run instead of once per heading.
	    cutoff = (datetime.now() - timedelta(days=1)).date()
	    for heading in page.xpath('//h2/a'):
	        title = heading.xpath('text()')
	        link = heading.xpath('@href')
	        if not (title and link):
	            continue
	        raw_date = link[0].replace('https://smartybro.com/', '')[:10]
	        try:
	            clean_date = datetime.strptime(raw_date, '%Y/%m/%d')
	        except ValueError:
	            # Not a dated post URL (e.g. a category or external link);
	            # one odd heading should not abort the whole crawl.
	            continue
	        if clean_date.date() >= cutoff:
	            yield (title[0], link[0])

	def parse_page(tuple_data):
	    """
	    Fetch one course post and return its first coupon link.
	    args:
	    tuple_data - (title, link) pair; only the link is used
	    returns:
	    the first href on the page that contains "couponCode", or None when the
	    page has no such link
	    """
	    link = tuple_data[1]
	    # Bounded wait so one stalled post cannot block a pool worker forever.
	    resp = requests.get(link, timeout=30)
	    page = html.fromstring(resp.content)
	    links = page.xpath('//a[contains(@href, "couponCode")]/@href')
	    if links:
	        return links[0]
	    return None

	def fake_courses():
	    """Return a fixed, one-entry list of (title, link) for offline testing.
	    """
	    return [
	        ('Test',
	         'https://smartybro.com/2020/10/29/catia-v5-3d-design-modeling-course-for-beginner/')
	    ]

	if __name__ == '__main__':
	    # Crawl the live listing in 'prod'; otherwise use the canned sample.
	    fetch_function = today_courses if ENV == 'prod' else fake_courses
	    with ThreadPool(5) as TP:
	        # Resolve every course page concurrently; map() keeps input order.
	        links = TP.map(parse_page, fetch_function())
	    # Drop pages without a coupon (None) and keep only the part of each
	    # link before the first '&'.
	    links = [x.split('&')[0] for x in links if x is not None]
	    for _link in links:
	        print(_link.strip())