Last active
June 26, 2022 09:27
-
-
Save Alireza-Farahani/adfa1c39940d113c19f97182145a1f48 to your computer and use it in GitHub Desktop.
Extra Scrapy item processors
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# All of these processors could easily be written as lambda functions.
# However, in combination with Scrapy's Compose and MapCompose, I think Processor objects are more concise and readable.
class DropLast:
    """Processor that discards the final element of a sequence."""

    def __call__(self, values: Sequence):
        """Return a slice of ``values`` containing everything except its last item.

        An empty sequence yields an empty slice of the same type.
        """
        everything_but_last = slice(None, -1)
        return values[everything_but_last]
class TakeWhile:
    """Processor that keeps the leading items of an iterable while a predicate holds."""

    def __init__(self, condition: Callable[[Any], bool]):
        """
        :param condition: predicate called on each item; collection stops at the
                          first item for which it returns a falsy value.
        """
        self.condition = condition

    def __call__(self, values: Iterable[Any]):
        """Return a list of the items preceding the first one that fails the predicate."""
        keep_going = self.condition
        kept = []
        for candidate in values:
            if not keep_going(candidate):
                # First failing item ends the run; it and everything after are discarded.
                break
            kept.append(candidate)
        return kept
class Split:
    """Processor that splits a string into a list of substrings."""

    def __init__(self, separator: Optional[str] = None):
        """
        :param separator: delimiter passed to ``str.split``. ``None`` (the default)
                          splits on runs of whitespace.
        """
        self.separator = separator

    def __call__(self, values: str):
        """Return ``values`` split on the configured separator.

        The separator is always forwarded to ``str.split``; the previous
        conditional was inverted and silently ignored any separator supplied.
        """
        return values.split(self.separator)
class TakeLast:
    """Processor that returns the final element of an iterable."""

    def __call__(self, values: Iterable):
        """Return the last item of ``values``, or ``None`` when there is none.

        Sequences are indexed directly; any other iterable is consumed to its
        end. Empty inputs of either kind return ``None``, so an empty sequence
        no longer raises ``IndexError`` while an empty iterator returns ``None``.
        A ``TypeError`` raised while iterating (e.g. ``values`` is not
        iterable) also results in ``None``.
        """
        if isinstance(values, Sequence):
            # Guard the empty case so sequences behave like other iterables.
            return values[-1] if len(values) else None
        try:
            item = None
            for item in values:
                pass
            return item
        except TypeError:
            return None
class Extract:
    """Processor that returns the first capture group of a regex match."""

    def __init__(self, pattern: Union[str, Pattern], drop_if_not_found=False):
        r"""
        :param pattern: regex pattern, given as a string or a compiled pattern.
                        Must include regex group notation. (e.g. r"\w+(world)")
        :param drop_if_not_found: If true, a value that doesn't match the pattern
                                  yields None instead of being passed through
                                  unchanged. Default is False.
        """
        self.drop = drop_if_not_found
        # Strings are compiled once here; anything else is stored as given.
        # The attribute keeps its original (misspelled) name.
        self.patter: Pattern = re.compile(pattern) if isinstance(pattern, str) else pattern

    def __call__(self, value: str) -> Optional[str]:
        """Return group 1 of the first match in ``value``.

        When nothing matches, return ``None`` if ``drop_if_not_found`` was set,
        otherwise return ``value`` itself.
        """
        found = re.search(self.patter, value)
        if not found:
            return None if self.drop else value
        return found.group(1)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# DropLast | |
>>> DropLast()([1, 2, 3]) | |
[1, 2] | |
# TakeWhile | |
>>> TakeWhile(lambda p: "suggested articles" not in p.lower())( | |
... ["Full time: ManUtd 8-2 Arsenal", "Suggested articles: ManUtd downfall", "Follow us in Twitter"] | |
... ) | |
["Full time: ManUtd 8-2 Arsenal"] | |
# Split: | |
>>> Split(", ")("ManUtd, Man City") | |
["ManUtd", "Man City"] | |
>>> Compose(Split(), Join(" "))("University\t\t\tof\nToronto") | |
"University of Toronto" | |
# TakeLast | |
>>> TakeLast()([1, 2, 3]) | |
3
>>> TakeLast()(range(5)) | |
4 | |
# Extract | |
>>> Extract(r'(?:Mr\.|Mrs\.|Ms\.|Miss) (\w+[ \w]+)')("Mr. Alireza Farahani")
"Alireza Farahani" | |
>>> pattern = r'\d\. (.+)' | |
>>> MapCompose(Extract(pattern, drop_if_not_found=True))(["1. Arsenal", "Liverpool", "3. Man City"]) | |
["Arsenal", "Man City"] | |
>>> MapCompose(Extract(pattern, drop_if_not_found=False))(["1. Arsenal", "Liverpool", "3. Man City"]) | |
["Arsenal", "Liverpool", "Man City"] |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment