Skip to content

Instantly share code, notes, and snippets.

@ziko442
Created December 22, 2020 14:34
Show Gist options
  • Save ziko442/d57d91da980e72414c725eb60878bc2d to your computer and use it in GitHub Desktop.
Save ziko442/d57d91da980e72414c725eb60878bc2d to your computer and use it in GitHub Desktop.
# -*- coding: utf-8 -*-
"""
This module provides utility functions that are used within the script.
"""
import os
import re
import sys
import time
import json
import errno
import random
import string
import logging
import datetime
from bs4 import BeautifulSoup as BeautifulSoup_
from xml.sax.saxutils import escape, unescape
import six
from six import iteritems
#from six.moves import html_parser
if sys.version_info[0] >= 3:
import html
else:
from six.moves import html_parser
html = html_parser.HTMLParser()
from six.moves.urllib.parse import ParseResult
from six.moves.urllib_parse import unquote_plus
# six.moves doesn’t support urlparse
if six.PY3: # pragma: no cover
from urllib.parse import urlparse, urljoin
else:
from urlparse import urlparse, urljoin
# Python 3 (and six) don't provide string.letters, so pick the right name here
if six.PY3:
from string import ascii_letters as string_ascii_letters
from string import digits as string_digits
else:
from string import letters as string_ascii_letters
from string import digits as string_digits
from .define import COURSERA_URL, WINDOWS_UNC_PREFIX
# Force use of bs4 with html.parser
def BeautifulSoup(page):
    """
    Parse a page with bs4, always selecting the 'html.parser' parser.

    @param page: Markup to hand over to bs4.

    @return: The object produced by bs4's BeautifulSoup for this page.
    """
    parser_name = 'html.parser'
    return BeautifulSoup_(page, parser_name)
if not six.PY2:
    def decode_input(x):
        """
        Return x unchanged; no decoding is performed outside Python 2.
        """
        return x
else:
    def decode_input(x):
        """
        Decode x using the encoding reported by stdin.

        Falls back to UTF-8 when stdin does not report an encoding.
        """
        reported = sys.stdin.encoding
        encoding = "UTF-8" if reported is None else reported
        return x.decode(encoding)
def spit_json(obj, filename):
    """
    Serialize obj as JSON, indented by 4 spaces, into the given file.

    @param obj: Object to serialize.

    @param filename: Path of the file to (over)write.
    @type filename: str
    """
    with open(filename, 'w') as sink:
        json.dump(obj, sink, indent=4)
def slurp_json(filename):
    """
    Read the given file and return its content parsed as JSON.

    @param filename: Path of the file to read.
    @type filename: str

    @return: The deserialized JSON document.
    """
    with open(filename) as source:
        parsed = json.load(source)
    return parsed
def is_debug_run():
    """
    Tell whether the root logger is enabled for the DEBUG level.

    @return: True if running with DEBUG loglevel.
    @rtype: bool
    """
    root_logger = logging.getLogger()
    return root_logger.isEnabledFor(logging.DEBUG)
def random_string(length):
    """
    Build a pseudo-random string made of ASCII letters and digits.

    @param length: Number of characters to generate.
    @type length: int

    @return: String of the requested length.
    @rtype: str
    """
    alphabet = string_ascii_letters + string_digits
    picked = [random.choice(alphabet) for _ in range(length)]
    return ''.join(picked)
# Taken from: https://wiki.python.org/moin/EscapingHtml
# escape() and unescape() take care of &, < and >; these tables cover the
# two quote characters.
HTML_ESCAPE_TABLE = {
    '"': "&quot;",
    "'": "&apos;",
}
# Inverse mapping: entity -> literal character.
HTML_UNESCAPE_TABLE = dict(
    zip(HTML_ESCAPE_TABLE.values(), HTML_ESCAPE_TABLE.keys()))
def unescape_html(s):
    """
    Undo HTML and URL encoding of a string.

    The string is passed through the module-level `html` object's unescape(),
    then through unquote_plus(), and finally through saxutils' unescape()
    with HTML_UNESCAPE_TABLE so that the quote entities are replaced too.

    @param s: Encoded string.
    @type s: str

    @return: Decoded string.
    @rtype: str
    """
    decoded = unquote_plus(html.unescape(s))
    return unescape(decoded, HTML_UNESCAPE_TABLE)
def clean_filename(s, minimal_change=False):
    """
    Sanitize a string to be used as a filename.

    The string is first HTML-unescaped and URL-unquoted. Then every
    character that is forbidden in file names (colon, slash, backslash,
    '<', '>', double quote, '|', '?', '*' and NUL) is replaced with '-',
    newlines are replaced with spaces, and trailing dots and spaces are
    removed.

    If minimal_change is set to true, nothing else is done. Otherwise
    parentheses are dropped, spaces become underscores and only ASCII
    letters, digits and the characters '-', '_' and '.' are kept.

    @param s: String to sanitize.
    @type s: str

    @param minimal_change: Only apply the bare minimum of replacements.
    @type minimal_change: bool

    @return: Sanitized file name (may be empty).
    @rtype: str
    """
    # First, deal with HTML and URL encoded strings
    s = unquote_plus(html.unescape(s))

    # Strip forbidden characters
    # https://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx
    for forbidden in ':/<>"\\|?*\x00':
        s = s.replace(forbidden, '-')
    s = s.replace('\n', ' ')

    # Remove trailing dots and spaces; forbidden on Windows
    s = s.rstrip(' .')

    if minimal_change:
        return s

    s = s.replace('(', '').replace(')', '')
    s = s.rstrip('.')  # Remove excess of trailing dots

    s = s.strip().replace(' ', '_')
    valid_chars = frozenset('-_.()' + string.ascii_letters + string.digits)
    s = ''.join(c for c in s if c in valid_chars)

    # Dropping characters above can expose a dot at the very end again
    # (e.g. "abc.<non-ascii>" -> "abc."), so strip trailing dots once more.
    return s.rstrip('.')
def normalize_path(path):
    """
    Normalize a path on Windows; on any other platform return it untouched.

    Normalizing means making the path absolute and prepending
    <backslash><backslash>?<backslash> (WINDOWS_UNC_PREFIX) to it, to get
    access to Win32 device namespace instead of Win32 file namespace.
    A path that already starts with the prefix is returned as is.
    See https://msdn.microsoft.com/en-us/library/aa365247%28v=vs.85%29.aspx#maxpath

    @param path: Path to normalize.
    @type path: str

    @return: Normalized path.
    @rtype: str
    """
    nothing_to_do = (sys.platform != 'win32'
                     or path.startswith(WINDOWS_UNC_PREFIX))
    if nothing_to_do:
        return path
    return WINDOWS_UNC_PREFIX + os.path.abspath(path)
def get_anchor_format(a):
    """
    Extract the resource file-type format from the anchor.

    @param a: Anchor (URL) text to inspect.
    @type a: str

    @return: The detected format, or None when the anchor does not match.
    @rtype: str or None
    """
    # ('.' or 'format=') then (file_extension) then ('?...' or end of string)
    # e.g. "...format=txt" or "...download.mp4?..."
    match = re.search(r"(?:\.|format=)(\w+)(?:\?.*)?$", a)
    if match is None:
        return None
    return match.group(1)
def mkdir_p(path, mode=0o777):
    """
    Create the directory given by path, including missing parents.

    An already existing directory is not an error; any other OSError
    raised by os.makedirs is propagated.

    @param path: Directory to create.
    @type path: str

    @param mode: Mode passed on to os.makedirs.
    @type mode: int
    """
    try:
        os.makedirs(path, mode)
    except OSError as exc:
        already_a_directory = (exc.errno == errno.EEXIST
                               and os.path.isdir(path))
        if not already_a_directory:
            raise
def clean_url(url):
    """
    Drop the params, query and fragment parts of a URL, so that
    `os.path.basename` and `os.path.splitext` can work correctly on it.
    Surrounding whitespace is stripped before parsing.

    @param url: URL to clean.
    @type url: str

    @return: Cleaned URL.
    @rtype: str
    """
    scheme, netloc, path = urlparse(url.strip())[:3]
    bare = ParseResult(
        scheme, netloc, path,
        params='', query='', fragment='')
    return bare.geturl()
def fix_url(url):
    """
    Strip whitespace characters from the beginning and the end of the url
    and prepend "http://" when the url is non-empty and has no scheme.

    @param url: URL to fix; None is passed through.
    @type url: str or None

    @return: Fixed URL, or None if url was None.
    @rtype: str or None
    """
    if url is None:
        return None

    stripped = url.strip()
    needs_scheme = bool(stripped) and not urlparse(stripped).scheme
    return "http://" + stripped if needs_scheme else stripped
def is_course_complete(last_update):
    """
    Determine if the course is likely to have been terminated or not.

    We return True if the timestamp given by last_update lies more than
    30 days before the current time. Otherwise (including when last_update
    is negative) we return False.

    The intended use case for this is to detect if a given course has not
    seen any update in the last 30 days or more; until then it is probably
    too soon to declare the course complete.

    @param last_update: Timestamp of the last update, in seconds since the
        epoch (it is compared against time.time()).

    @return: True if the last update is more than 30 days old.
    @rtype: bool
    """
    if not last_update >= 0:
        return False

    elapsed = time.time() - last_update
    threshold = total_seconds(datetime.timedelta(days=30))
    return elapsed > threshold
def total_seconds(td):
    """
    Compute the number of whole seconds contained in a timedelta.

    Added for backward compatibility, pre 2.7. The result is floored to an
    integer number of seconds.

    @param td: Duration to convert.
    @type td: datetime.timedelta

    @return: Whole seconds in td.
    @rtype: int
    """
    whole_seconds = td.seconds + td.days * 24 * 3600
    total_microseconds = td.microseconds + whole_seconds * 10 ** 6
    return total_microseconds // 10 ** 6
def make_coursera_absolute_url(url):
    """
    Return url unchanged when it already carries a netloc; otherwise join
    it onto COURSERA_URL.

    @param url: Absolute or relative URL.
    @type url: str

    @return: URL with a netloc.
    @rtype: str
    """
    if urlparse(url).netloc:
        return url
    return urljoin(COURSERA_URL, url)
def extend_supplement_links(destination, source):
    """
    Extends (merges) destination dictionary with supplement_links
    from source dictionary. Values are expected to be lists, or any
    data structure that has `extend` method.

    The destination is modified in place; the source and its values are
    left untouched.

    @param destination: Destination dictionary that will be extended.
    @type destination: @see CourseraOnDemand._extract_links_from_text

    @param source: Source dictionary that will be used to extend
        destination dictionary.
    @type source: @see CourseraOnDemand._extract_links_from_text
    """
    import copy  # local import keeps this block self-contained

    for key, value in source.items():
        if key in destination:
            destination[key].extend(value)
        else:
            # Store a shallow copy: sharing the very same container would
            # make a later extend() on destination silently grow the
            # caller's source value as well.
            destination[key] = copy.copy(value)
def print_ssl_error_message(exception):
    """
    Log (at ERROR level) a banner with the type and text of the given
    exception, plus a URL to instructions on how to fix SSL errors.

    @param exception: The exception to report.
    @type exception: Exception
    """
    template = """
#####################################################################
# ATTENTION! PLEASE READ THIS!
#
# The following error has just occurred:
# %s %s
#
# Please read instructions on how to fix this error here:
# https://github.com/coursera-dl/coursera-dl#sslerror-errno-1-_sslc504-error14094410ssl-routinesssl3_read_bytessslv3-alert-handshake-failure
#####################################################################
"""
    details = (type(exception).__name__, str(exception))
    logging.error(template % details)
@dorbesh1
Copy link

dorbesh1 commented Sep 5, 2021

thank u for this

@TeymurovFuad
Copy link

works for me, python 3.10

@paarcaa
Copy link

paarcaa commented Feb 11, 2022

Thanks. That works for me

@jasonzjc
Copy link

Works for me. Thanks! Python 3.9.4.

@sepehr1414
Copy link

thank you it works perfectly :)) python 3.10

@sana33
Copy link

sana33 commented Jun 24, 2022

That worked for me on Windows 10 and it's been really useful!! Thank you very much ...

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment