Elias Dabbas (eliasdabbas)

💭 https://bit.ly/100DaysOfCode_elias
<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"
        xmlns:xhtml="http://www.w3.org/1999/xhtml">
  <url>
    <loc>http://www.example.com/</loc>
    <lastmod>2005-01-01</lastmod>
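A sitemap like this can be read straight into a table for analysis; a minimal sketch using advertools (the library that appears throughout these gists), with the sitemap URL as a placeholder:

import advertools as adv

# sitemap_to_df fetches a sitemap (or sitemap index) and returns one row per URL,
# with columns such as 'loc' and 'lastmod'; the URL below is only a placeholder
sitemap_df = adv.sitemap_to_df('https://www.example.com/sitemap.xml')
print(sitemap_df.head())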
eliasdabbas / user_agent_to_data_frame.py
Created November 30, 2021 16:11
From a list of user agents to a DataFrame of parsed UAs.
user_agents = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36',
    'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36',
    'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:64.0) Gecko/20100101 Firefox/64.0',
    'Mozilla/5.0 (Windows NT 6.1; rv:64.0) Gecko/20100101 Firefox/64.0',
    'Mozilla/5.0 (Linux; Android 6.0.1; Nexus 5X Build/MMB29P) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.96 Mobile Safari/537.36 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)',
    'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:64.0) Gecko/20100101 Firefox/64.0',
    'Mozilla/5.0 (compatible; bingbot/2.0; +http://www.bing.com/bingbot.htm)',
    'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)',
]
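A minimal sketch of the parsing step the title describes, assuming the user-agents package (not shown in this preview) as the parser:

import pandas as pd
from user_agents import parse  # pip install user-agents (assumed parser)

records = []
for ua_string in user_agents:
    ua = parse(ua_string)
    records.append({'user_agent': ua_string,
                    'browser': ua.browser.family,
                    'os': ua.os.family,
                    'device': ua.device.family,
                    'is_bot': ua.is_bot})
ua_df = pd.DataFrame(records)
print(ua_df)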
eliasdabbas / to_crawl_or_not_to_crawl.py
Created December 19, 2021 19:21
Check if a given URL will be crawled or not given a set of conditions.
from urllib.parse import urlsplit, parse_qs
import re


def crawl_or_not(url,
                 exclude_url_params=None,
                 include_url_params=None,
                 include_url_pattern=None,
                 exclude_url_pattern=None):
    """Check if ``url`` will be crawled or not given the supplied conditions.
eliasdabbas / get_bot_ip_addresses.py
Last active March 24, 2024 12:14
Get the most up-to-date list of IP addresses for the crawler bots belonging to Google and Bing.
import ipaddress
import requests
import pandas as pd


def bot_ip_addresses():
    bots_urls = {
        'google': 'https://developers.google.com/search/apis/ipranges/googlebot.json',
        'bing': 'https://www.bing.com/toolbox/bingbot.json'
    }
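The preview ends at the URL mapping. A sketch of how the remaining steps might go, assuming the 'prefixes'/'ipv4Prefix'/'ipv6Prefix' keys from Google's published JSON format (and assuming Bing's file follows the same schema):

def bot_ip_addresses_sketch():
    bots_urls = {
        'google': 'https://developers.google.com/search/apis/ipranges/googlebot.json',
        'bing': 'https://www.bing.com/toolbox/bingbot.json',
    }
    rows = []
    for bot, url in bots_urls.items():
        for prefix in requests.get(url).json().get('prefixes', []):
            # each entry holds either an 'ipv4Prefix' or an 'ipv6Prefix' network
            network = ipaddress.ip_network(prefix.get('ipv4Prefix') or prefix.get('ipv6Prefix'))
            rows.append({'bot': bot, 'network': network, 'ip_version': network.version})
    return pd.DataFrame(rows)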
eliasdabbas / score_links.py
Last active September 20, 2022 12:26
Score internal links using two columns of "Source" and "Destination". This calculates various link-importance metrics like degree centrality, betweenness centrality, and PageRank.
# !pip install --upgrade transformers plotly pandas
import plotly.graph_objects as go
import pandas as pd
pd.options.display.max_columns = None
from transformers import pipeline
unmasker = pipeline('fill-mask', model='bert-base-uncased')
results = []
cars = ['mercedes', 'audi', 'bmw', 'volkswagen', 'ford', 'toyota',
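The score_links.py entry above mentions degree centrality, betweenness centrality, and PageRank; a minimal sketch of computing those from "Source" and "Destination" columns, using networkx (assumed, since the library is not visible in this preview):

import networkx as nx
import pandas as pd

# Hypothetical internal-link edge list with the two columns the description mentions
links = pd.DataFrame({'Source': ['/', '/', '/blog', '/blog/post-1'],
                      'Destination': ['/blog', '/about', '/blog/post-1', '/about']})
G = nx.from_pandas_edgelist(links, source='Source', target='Destination',
                            create_using=nx.DiGraph)
scores = pd.DataFrame({'degree_centrality': nx.degree_centrality(G),
                       'betweenness_centrality': nx.betweenness_centrality(G),
                       'pagerank': nx.pagerank(G)})
print(scores)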
eliasdabbas / robots_sitemaps_urls_wordfreq.sh
Last active April 6, 2022 20:35
Fetch the robots.txt file, get the relevant XML sitemap, extract and split URLs, and count words in article titles. Watch this video for more details: https://bit.ly/3HMZC0A
# pip install advertools==0.14.0a7
# get the robots.txt file, save to csv:
advertools robots --url https://www.economist.com/robots.txt econ_robots.csv
# find lines that start with sitemap, save to variable sitemap_url
sitemap_url=$(grep ^sitemap -i econ_robots.csv | cut -d , -f 2)
# get the sitemap index file without downloading the sub-sitemaps (not recursive),
advertools sitemaps $sitemap_url econ_sitemap.csv --recursive 0
import datetime
import advertools as adv
import pandas as pd
stopwords = ['to', 'of', 'the', 'in', 'for', 'and', 'on', 'a', 'as', 'with',
             'from', 'over', 'is', 'at', '—', '-', 'be', '2022', '–', 'it', 'by',
             'we', 'why', 'but', 'my', 'how', 'not', 'an', 'are', 'no', 'go',
             'your', 'up', 'his']
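A sketch of the word-count step the description mentions; adv.sitemap_to_df, adv.url_to_df, and adv.word_frequency are advertools functions, but the sitemap URL is a placeholder and the column and parameter names here should be treated as assumptions:

sitemap = adv.sitemap_to_df('https://www.economist.com/sitemap.xml')  # placeholder URL
urls = adv.url_to_df(sitemap['loc'])                          # split URLs into components
titles = urls['last_dir'].str.replace('-', ' ', regex=False)  # slugs as rough article titles
word_counts = adv.word_frequency(titles, rm_words=stopwords)
print(word_counts.head(20))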
from unicodedata import lookup


def flag(cc):
    l1 = lookup(f'REGIONAL INDICATOR SYMBOL LETTER {cc[0]}')
    l2 = lookup(f'REGIONAL INDICATOR SYMBOL LETTER {cc[1]}')
    return l1 + l2
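For example, two uppercase letters map to the corresponding flag emoji:

print(flag('FR'))  # 🇫🇷
print(flag('US'))  # 🇺🇸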
eliasdabbas / crawl_multiple_sites.py
Last active April 27, 2022 08:56
Crawl multiple websites with one for loop, while saving the output, logs, and job status separately for each website. Resume crawling at any time simply by re-running the same code.
from urllib.parse import urlsplit
import advertools as adv
sites = [
    'https://www.who.int',
    'https://www.nytimes.com',
    'https://www.washingtonpost.com',
]
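The loop itself is not in the preview; a sketch of what it might look like with adv.crawl, keeping output, logs, and job state separate per site via Scrapy's LOG_FILE and JOBDIR settings (based on the description, not necessarily the gist's exact code):

for site in sites:
    name = urlsplit(site).netloc.replace('www.', '')
    adv.crawl(site,
              output_file=f'{name}.jl',        # one jsonlines output file per site
              follow_links=True,
              custom_settings={
                  'LOG_FILE': f'{name}.log',   # separate Scrapy log per site
                  'JOBDIR': f'{name}_job',     # persistent job dir allows resuming
              })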