Skip to content

Instantly share code, notes, and snippets.

@Alireza-Farahani
Last active June 26, 2022 09:27
Show Gist options
  • Save Alireza-Farahani/adfa1c39940d113c19f97182145a1f48 to your computer and use it in GitHub Desktop.
Save Alireza-Farahani/adfa1c39940d113c19f97182145a1f48 to your computer and use it in GitHub Desktop.
Extra Scrapy item processors
# All of these processors could easily be written as lambda functions.
# However, when combined with Scrapy's Compose and MapCompose, I think processor objects are more concise and readable.
class DropLast:
    """Processor that returns its input without the final element."""

    # Slice selecting everything except the last position.
    _ALL_BUT_LAST = slice(None, -1)

    def __call__(self, values: Sequence):
        """Return ``values`` sliced to exclude its last item.

        The result has whatever type slicing ``values`` produces (a list for a
        list, a str for a str, ...). An empty input yields an empty slice.
        """
        return values[self._ALL_BUT_LAST]
class TakeWhile:
    """Processor that keeps the leading items for which a predicate holds."""

    def __init__(self, condition: Callable[[Any], bool]):
        """Store the predicate that decides how long items are kept.

        :param condition: called with each item; the first falsy result stops
            the processor, and that item and everything after it are discarded.
        """
        self.condition = condition

    def __call__(self, values: Iterable[Any]):
        """Return a list with the leading run of ``values`` that satisfy the predicate."""
        leading_run = itertools.takewhile(self.condition, values)
        return [*leading_run]
class Split:
    """Processor that splits a string into a list of parts."""

    def __init__(self, separator: Optional[str] = None):
        """Store the delimiter to split on.

        :param separator: delimiter passed to ``str.split``. ``None`` (the default)
            splits on runs of whitespace and discards empty strings.
        """
        self.separator = separator

    def __call__(self, values: str):
        """Return the parts of ``values`` split on the configured separator."""
        # ``str.split(None)`` already means "split on whitespace", so the
        # separator can be forwarded unconditionally. The previous conditional
        # was inverted and ignored any separator that was actually supplied.
        return values.split(self.separator)
class TakeLast:
    """Processor that returns the final item of its input."""

    def __call__(self, values: Iterable):
        """Return the last item of ``values``.

        Sequences are indexed directly; any other iterable is consumed to its
        end. Returns ``None`` when ``values`` is empty or when iterating it
        raises ``TypeError`` (for example, because it is not iterable).
        """
        if isinstance(values, Sequence):
            # An empty sequence has no last item; return None, as the
            # iterable branch below does, instead of raising IndexError.
            return values[-1] if len(values) else None

        last = None
        try:
            for last in values:
                pass
        except TypeError:
            return None
        return last
class Extract:
    """Processor that extracts part of a string with a regular expression."""

    def __init__(self, pattern: Union[str, Pattern], drop_if_not_found=False):
        r"""
        :param pattern: regex pattern, given as a string or an already compiled pattern.
                        The first group is the extracted part (e.g. r"\w+(world)").
                        A pattern without any group extracts the whole matched text.
        :param drop_if_not_found: If true, in case original value doesn't match the pattern,
                                  this processor doesn't pass original value to next processors.
                                  Default is False.
        """
        # ``re.compile`` returns an already compiled pattern unchanged, so it
        # normalises both accepted input types.
        self.patter: Pattern = re.compile(pattern)
        self.drop = drop_if_not_found

    def __call__(self, value: str) -> Optional[str]:
        """Return the extracted text, or the fallback when ``value`` does not match.

        On a match, returns group 1 (which is ``None`` if that group did not
        take part in the match), or the whole match when the pattern defines
        no groups. Without a match, returns ``None`` if ``drop_if_not_found``
        was set and the unchanged ``value`` otherwise.
        """
        match = self.patter.search(value)
        if match is None:
            return None if self.drop else value
        # ``group(1)`` raises IndexError for a group-less pattern; fall back to
        # the full match in that case.
        return match.group(1 if self.patter.groups else 0)
# DropLast
>>> DropLast()([1, 2, 3])
[1, 2]
# TakeWhile
>>> TakeWhile(lambda p: "suggested articles" not in p.lower())(
... ["Full time: ManUtd 8-2 Arsenal", "Suggested articles: ManUtd downfall", "Follow us in Twitter"]
... )
["Full time: ManUtd 8-2 Arsenal"]
# Split:
>>> Split(", ")("ManUtd, Man City")
["ManUtd", "Man City"]
>>> Compose(Split(), Join(" "))("University\t\t\tof\nToronto")
"University of Toronto"
# TakeLast
>>> TakeLast()([1, 2, 3])
3
>>> TakeLast()(range(5))
4
# Extract
>>> Extract(r'(?:Mr\.|Mrs\.|Ms\.|Miss) (\w+[ \w]+)')("Mr. Alireza Farahani")
"Alireza Farahani"
>>> pattern = r'\d\. (.+)'
>>> MapCompose(Extract(pattern, drop_if_not_found=True))(["1. Arsenal", "Liverpool", "3. Man City"])
["Arsenal", "Man City"]
>>> MapCompose(Extract(pattern, drop_if_not_found=False))(["1. Arsenal", "Liverpool", "3. Man City"])
["Arsenal", "Liverpool", "Man City"]
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment