Created
July 12, 2021 05:40
-
-
Save nghiahsgs/1b7ec14f7eaff95a50bce38f9aa32c7c to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
class String_Interact():
    """Helpers for regex extraction, Vietnamese text clean-up and HTML scraping.

    Most methods do not read instance state; they are grouped on a class only
    for namespacing. Several methods rely on names that must be provided by
    the surrounding module (``re``, ``hashlib``, ``bs4``, ``slugify``,
    ``parse_qs``, ``time``, ``random``, ``Image_Interact`` and the
    ``list_first_name_*`` / ``list_last_name_*`` lists).
    """

    # Lowercase Vietnamese letters grouped by the plain ASCII letter that
    # replaces them in `convert`.
    _VI_ACCENTED = {
        'a': 'àáảãạăắằẵặẳâầấậẫẩ',
        'd': 'đ',
        'e': 'èéẻẽẹêềếểễệ',
        'i': 'ìíỉĩị',
        'o': 'òóỏõọôồốổỗộơờớởỡợ',
        'u': 'ùúủũụưừứửữự',
        'y': 'ỳýỷỹỵ',
    }

    # Single-pass translation table covering both the lowercase letters above
    # and their uppercase counterparts.
    _VI_TRANSLATION = str.maketrans({
        variant: plain
        for base, accented in _VI_ACCENTED.items()
        for lower_char in accented
        for variant, plain in (
            (lower_char, base),
            (lower_char.upper(), base.upper()),
        )
    })

    # Character -> HTML named entity, used by `encode_tieng_viet_html`.
    # NOTE(review): in the copy of this file under review every entry mapped
    # a character to itself, which made the method a no-op; presumably the
    # entity text had been decoded somewhere along the way. The values below
    # are the standard HTML named entities for these characters -- confirm
    # they match what the consumer expects.
    _HTML_ENTITIES = {
        'à': '&agrave;',
        'á': '&aacute;',
        'â': '&acirc;',
        'ã': '&atilde;',
        'À': '&Agrave;',
        'Á': '&Aacute;',
        'Â': '&Acirc;',
        'Ã': '&Atilde;',
        'è': '&egrave;',
        'é': '&eacute;',
        'ê': '&ecirc;',
        'È': '&Egrave;',
        'É': '&Eacute;',
        'Ê': '&Ecirc;',
        'ì': '&igrave;',
        'í': '&iacute;',
        'Ì': '&Igrave;',
        'Í': '&Iacute;',
        'ò': '&ograve;',
        'ó': '&oacute;',
        'ô': '&ocirc;',
        'Ò': '&Ograve;',
        'Ó': '&Oacute;',
        'Ô': '&Ocirc;',
        'ù': '&ugrave;',
        'ú': '&uacute;',
        'Ù': '&Ugrave;',
        'Ú': '&Uacute;',
    }

    # Characters kept by `keep_normal_char`: ASCII letters, digits and space.
    _NORMAL_CHARS = frozenset(
        'qwertyuiopasdfghjklzxcvbnm'
        'QWERTYUIOPASDFGHJKLZXCVBNM'
        '0123456789'
        ' '
    )

    def __init__(self):
        pass

    def regex_one_value(self, pattern, input_str):
        """Return the first capture group of the first match, or ''.

        pattern: regular expression (string or compiled pattern).
        input_str: text to search.
        Return: group 1 of the first match; the whole match when the pattern
        defines no capture group; '' when nothing matches or group 1 did not
        take part in the match.
        """
        match = re.search(pattern, input_str)
        if not match:
            return ''
        if match.re.groups == 0:
            # No capture group: group(1) would raise IndexError, so fall
            # back to the full match.
            return match.group(0)
        value = match.group(1)
        return '' if value is None else value

    def regex_many_value(self, pattern, input_str):
        """Return `re.findall` results for `pattern` over `input_str`."""
        return re.findall(pattern, input_str)

    def convert(self, text):
        """Strip Vietnamese diacritics from `text`.

        Converts Vietnamese text with diacritics ("tieng Viet co dau") into
        the unaccented form ("tieng Viet khong dau"), for both lowercase and
        uppercase letters.

        text: input string to be converted.
        Return: the converted string.
        """
        import unicodedata

        # Compose base letter + combining marks first; otherwise decomposed
        # (NFD) input would not match the precomposed letters in the table.
        composed = unicodedata.normalize('NFC', text)
        return composed.translate(String_Interact._VI_TRANSLATION)

    def encode_tieng_viet_html(self, string_input):
        """Replace accented characters with HTML named entities.

        Only the characters listed in `_HTML_ENTITIES` are replaced; every
        other character is copied through unchanged.
        """
        entities = String_Interact._HTML_ENTITIES
        return ''.join(entities.get(char, char) for char in string_input)

    def md5_encode(self, text):
        """Return the hexadecimal MD5 digest of `text` (encoded with
        `str.encode()`'s default encoding)."""
        return hashlib.md5(text.encode()).hexdigest()

    def keep_normal_char(self, full_string):
        """Return `full_string` with everything except ASCII letters, digits
        and spaces removed."""
        allowed = String_Interact._NORMAL_CHARS
        return ''.join(char for char in full_string if char in allowed)

    def get_element_by_css_selector(self, data, css_selector):
        """Parse `data` with BeautifulSoup ('html.parser') and return the
        elements selected by `css_selector`.

        Callers typically use `.get_text()` or `.get('href')` on the results.
        """
        soup = bs4.BeautifulSoup(data, 'html.parser')
        return soup.select(css_selector)

    def extractListTextByCSSSelector(self, data, listCSSSelector, listeAttr):
        """Extract one result list per CSS selector from `data`.

        `listCSSSelector[i]` is paired with `listeAttr[i]`, which selects
        what is extracted from each matched element:

        - "innerText": `get_text()` with newlines, tabs and carriage returns
          removed, then stripped.
        - "innerHTML": the element formatted with '%s'.
        - "origin": the matched elements themselves.
        - anything else: `element.get(<that attribute name>)`.

        Return: a list of lists, in the same order as `listCSSSelector`.
        Raises IndexError when `listeAttr` is shorter than `listCSSSelector`.

        Example:
            listCSSSelector = [".topic_img", ".topic-box a", ".topic-box a"]
            listeAttr = ["src", "innerText", "href"]
            L0, L1, L2 = string_Interact1.extractListTextByCSSSelector(
                data, listCSSSelector, listeAttr)
        """
        list_L_result = []
        for index, css_selector in enumerate(listCSSSelector):
            attr = listeAttr[index]
            elements = self.get_element_by_css_selector(data, css_selector)
            if attr == "innerText":
                extracted = [
                    element.get_text()
                    .replace("\n", "")
                    .replace("\t", "")
                    .replace("\r", "")
                    .strip()
                    for element in elements
                ]
            elif attr == "innerHTML":
                extracted = ["%s" % element for element in elements]
            elif attr == "origin":
                extracted = elements
            else:
                extracted = [element.get(attr) for element in elements]
            list_L_result.append(extracted)
        return list_L_result

    def get_first_element_of_list_if_have(self, L):
        """Return `L[0]`, or '' when `L` is empty."""
        return L[0] if len(L) else ''

    def gen_slug(self, title):
        """Build a URL slug from `title`.

        Diacritics are stripped with `convert`, spaces become '-', the
        characters ':', '.' and ',' are dropped, the text is lowercased, and
        the result is passed through `slugify`.
        """
        slug = (
            self.convert(title)
            .replace(' ', '-')
            .replace(':', '')
            .replace('.', '')
            .replace(',', '')
            .lower()
        )
        slug = slug.replace('--', '-')
        return slugify(slug)

    def url_decode(self, href):
        """Return `parse_qs(href)`.

        NOTE(review): `href` is handed to `parse_qs` as-is, so it is treated
        as a bare query string; when a full URL is passed, everything before
        the first '=' becomes part of the first key -- confirm what callers
        pass in.
        """
        return parse_qs(href)

    def remove_all_a_tag_in_html(self, ndung):
        """Replace every <a> element in the HTML string `ndung` with its text.

        NOTE(review): each tag is located by searching `ndung` for the
        element re-serialised by BeautifulSoup; a tag whose serialised form
        differs from the original markup would be left untouched.
        """
        for a_tag in self.get_element_by_css_selector(ndung, 'a'):
            ndung = ndung.replace('%s' % a_tag, a_tag.get_text())
        return ndung

    def download_all_image_in_html_file(self, ndung, folder_save):
        """Download every image referenced in `ndung` and point the HTML at
        the local copies.

        For each <img ...> tag the `data-original` attribute, when present,
        is preferred over `src` (and is written over the `src` value in the
        HTML). Each distinct source is downloaded once through
        `Image_Interact.downloadImg` and every occurrence of it in the HTML
        is replaced by '<folder_save>/<local file name>'.

        ndung: HTML text.
        folder_save: destination folder passed to `downloadImg` and used as
        the prefix of the rewritten links.
        Return: (list of local file names, rewritten HTML).
        """
        image_Interact1 = Image_Interact()

        # Collect the real source of every image tag.
        list_image_src = []
        for image_tag in self.regex_many_value(r'<img(.*?)>', ndung):
            src = self.regex_one_value(r'src="(.*?)"', image_tag)
            data_original = self.regex_one_value(
                r'data-original="(.*?)"', image_tag)
            if data_original:
                real_source = data_original
                # Swap the placeholder src for the real image URL. Guard
                # against an empty src: str.replace('', x) would insert x
                # between every character of the tag.
                if src:
                    image_tag_new = image_tag.replace(src, real_source)
                    ndung = ndung.replace(image_tag, image_tag_new)
            else:
                real_source = src
            # Skip tags without a usable source (an empty string would
            # corrupt the whole document in the replace step below) and
            # sources already queued (one replace handles every occurrence).
            if real_source and real_source not in list_image_src:
                list_image_src.append(real_source)

        # Download the images.
        list_local_image = []
        used_names = set()
        for image_src in list_image_src:
            nameFile = '%s-%s' % (int(time.time()), random.randint(0, 1000))
            # Redraw on collision so two images never share a local file.
            while nameFile in used_names:
                nameFile = '%s-%s' % (
                    int(time.time()), random.randint(0, 1000))
            used_names.add(nameFile)
            image_Interact1.downloadImg(image_src, nameFile, folder_save)
            # Presumably downloadImg stores the file with a .jpg extension
            # -- verify against Image_Interact.
            list_local_image.append(nameFile + '.jpg')

        # Replace the remote links with the local ones.
        for image_src, local_image in zip(list_image_src, list_local_image):
            ndung = ndung.replace(
                image_src, '%s/%s' % (folder_save, local_image))
        return list_local_image, ndung

    def randomFirstName(self, type='en'):
        """Return a random first name: from `list_first_name_vi` when `type`
        is 'vi', otherwise from `list_first_name_en`."""
        if type == 'vi':
            return random.choice(list_first_name_vi)
        return random.choice(list_first_name_en)

    def randomLastName(self, type='en'):
        """Return a random last name: from `list_last_name_vi` when `type`
        is 'vi', otherwise from `list_last_name_en`."""
        if type == 'vi':
            return random.choice(list_last_name_vi)
        return random.choice(list_last_name_en)

    def randomUserName(self, type='en'):
        """Return '<first>_<last>_<number>' with a random 4-digit number
        (1000-9999)."""
        firstname = self.randomFirstName(type)
        lastname = self.randomLastName(type)
        number = random.randint(1000, 9999)
        return '%s_%s_%s' % (firstname, lastname, number)

    def randomPass(self, type='en'):
        """Return '<random user name>@<number>' with a random 4-digit number
        (1000-9999).

        NOTE(review): values come from the `random` module, which is not
        designed for security-sensitive use.
        """
        username = self.randomUserName(type)
        number = random.randint(1000, 9999)
        return '%s@%s' % (username, number)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment