Skip to content

Instantly share code, notes, and snippets.

@nghiahsgs
Created July 12, 2021 05:40
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save nghiahsgs/1b7ec14f7eaff95a50bce38f9aa32c7c to your computer and use it in GitHub Desktop.
Save nghiahsgs/1b7ec14f7eaff95a50bce38f9aa32c7c to your computer and use it in GitHub Desktop.
class String_Interact():
    """Collection of string utilities for scraping and content generation.

    Covers regex extraction, Vietnamese diacritic folding, HTML entity
    encoding, MD5 hashing, slug generation, CSS-selector based extraction
    and random user name / password generation. The methods keep no
    instance state.

    The class relies on names that must be provided by the enclosing module:
    ``re``, ``hashlib``, ``time``, ``random``, ``bs4``, ``slugify``,
    ``parse_qs``, ``Image_Interact`` and the ``list_first_name_*`` /
    ``list_last_name_*`` lists.
    """

    # Vietnamese lower-case letters grouped by the ASCII letter they fold to.
    _FOLD_GROUPS = {
        'àáảãạăắằẵặẳâầấậẫẩ': 'a',
        'đ': 'd',
        'èéẻẽẹêềếểễệ': 'e',
        'ìíỉĩị': 'i',
        'òóỏõọôồốổỗộơờớởỡợ': 'o',
        'ùúủũụưừứửữự': 'u',
        'ỳýỷỹỵ': 'y',
    }

    # Code point -> replacement table for str.translate, built once for both
    # the lower-case letters above and their upper-case counterparts.
    _FOLD_TABLE = {}
    for _chars, _plain in _FOLD_GROUPS.items():
        for _ch in _chars:
            _FOLD_TABLE[ord(_ch)] = _plain
            _FOLD_TABLE[ord(_ch.upper())] = _plain.upper()
    del _chars, _plain, _ch

    # Accented letter -> HTML named character reference.
    # NOTE(review): the scraped source mapped every character to itself
    # (a no-op), presumably because the entities were decoded somewhere along
    # the way; the named entities are restored here - confirm against the
    # original.
    _HTML_ENTITIES = {
        'à': '&agrave;',
        'á': '&aacute;',
        'â': '&acirc;',
        'ã': '&atilde;',
        'À': '&Agrave;',
        'Á': '&Aacute;',
        'Â': '&Acirc;',
        'Ã': '&Atilde;',
        'è': '&egrave;',
        'é': '&eacute;',
        'ê': '&ecirc;',
        'È': '&Egrave;',
        'É': '&Eacute;',
        'Ê': '&Ecirc;',
        'ì': '&igrave;',
        'í': '&iacute;',
        'Ì': '&Igrave;',
        'Í': '&Iacute;',
        'ò': '&ograve;',
        'ó': '&oacute;',
        'ô': '&ocirc;',
        'Ò': '&Ograve;',
        'Ó': '&Oacute;',
        'Ô': '&Ocirc;',
        'ù': '&ugrave;',
        'ú': '&uacute;',
        'Ù': '&Ugrave;',
        'Ú': '&Uacute;',
    }

    # Characters kept by keep_normal_char: ASCII letters, digits and space.
    _PLAIN_CHARS = frozenset(
        'abcdefghijklmnopqrstuvwxyz'
        'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
        '0123456789 '
    )

    def regex_one_value(self, pattern, input_str):
        """Return capture group 1 of the first match of *pattern*.

        Args:
            pattern: Regular expression containing at least one group.
            input_str: Text to search.

        Returns:
            The text captured by group 1, or ``''`` when nothing matches.
            ``None`` is returned if the pattern matched but group 1 did not
            take part in the match.

        Raises:
            IndexError: If *pattern* defines no capture group.
        """
        match = re.search(pattern, input_str)
        if match is None:
            return ''
        return match.group(1)

    def regex_many_value(self, pattern, input_str):
        """Return every non-overlapping match of *pattern* in *input_str*.

        The result follows ``re.findall``: a list of strings, or a list of
        tuples when the pattern has more than one group.
        """
        return re.findall(pattern, input_str)

    def convert(self, text):
        """Convert accented Vietnamese text to its unaccented ASCII form.

        Args:
            text: Input string to be converted.

        Returns:
            The string with every Vietnamese accented letter (either case)
            replaced by its base ASCII letter; other characters are kept.
        """
        import unicodedata

        # Compose base letter + combining marks first so that decomposed
        # input hits the single-code-point entries of the table as well.
        composed = unicodedata.normalize('NFC', text)
        return composed.translate(self._FOLD_TABLE)

    def encode_tieng_viet_html(self, string_input):
        """Replace accented letters with HTML named character references.

        Only the letters listed in ``_HTML_ENTITIES`` are encoded; every
        other character is copied through unchanged.
        """
        entities = self._HTML_ENTITIES
        return ''.join(entities.get(char, char) for char in string_input)

    def md5_encode(self, text):
        """Return the hexadecimal MD5 digest of *text* (UTF-8 encoded)."""
        return hashlib.md5(text.encode()).hexdigest()

    def keep_normal_char(self, full_string):
        """Drop every character that is not an ASCII letter, digit or space."""
        allowed = self._PLAIN_CHARS
        return ''.join(char for char in full_string if char in allowed)

    def get_element_by_css_selector(self, data, css_selector):
        """Parse *data* as HTML and return the elements matching the selector.

        Args:
            data: HTML markup.
            css_selector: CSS selector passed to ``BeautifulSoup.select``.

        Returns:
            The list of matching elements; call ``get_text()`` or
            ``get('href')`` on the items to read their content.
        """
        soup = bs4.BeautifulSoup(data, 'html.parser')
        return soup.select(css_selector)

    def extractListTextByCSSSelector(self, data, listCSSSelector, listeAttr):
        """Extract one list of values per (selector, attribute) pair.

        Args:
            data: HTML markup.
            listCSSSelector: CSS selectors to evaluate.
            listeAttr: For each selector, what to read from the matches:
                ``"innerText"`` - text with newlines, tabs and carriage
                returns removed and surrounding whitespace stripped;
                ``"innerHTML"`` - the string form of the element (note: this
                is the element's own markup, tag included);
                ``"origin"`` - the element objects themselves;
                anything else - the value of that HTML attribute.

        Returns:
            A list with one result list per selector, in selector order.

        Raises:
            IndexError: If *listeAttr* is shorter than *listCSSSelector*.

        Example:
            listCSSSelector = [".topic_img", ".topic-box a", ".topic-box a"]
            listeAttr = ["src", "innerText", "href"]
            L0, L1, L2 = string_Interact1.extractListTextByCSSSelector(
                data, listCSSSelector, listeAttr)
        """
        # zip() would silently truncate; keep the original failure mode.
        if len(listeAttr) < len(listCSSSelector):
            raise IndexError('listeAttr is shorter than listCSSSelector')

        list_L_result = []
        for selector, attr in zip(listCSSSelector, listeAttr):
            elements = self.get_element_by_css_selector(data, selector)
            if attr == "innerText":
                values = [
                    element.get_text()
                    .replace("\n", "")
                    .replace("\t", "")
                    .replace("\r", "")
                    .strip()
                    for element in elements
                ]
            elif attr == "innerHTML":
                values = ["%s" % element for element in elements]
            elif attr == "origin":
                values = elements
            else:
                values = [element.get(attr) for element in elements]
            list_L_result.append(values)
        return list_L_result

    def get_first_element_of_list_if_have(self, L):
        """Return the first item of *L*, or ``''`` if *L* is empty or None."""
        if L is None or len(L) == 0:
            return ''
        return L[0]

    def gen_slug(self, title):
        """Build a URL slug from *title*.

        Diacritics are folded with ``convert``, spaces become hyphens,
        ``:``, ``.`` and ``,`` are dropped, the text is lower-cased and the
        result is passed through the module-level ``slugify`` function.
        """
        slug = self.convert(title)
        slug = slug.replace(' ', '-').replace(':', '').replace('.', '')
        slug = slug.replace(',', '').lower()
        slug = slug.replace('--', '-')
        return slugify(slug)

    def url_decode(self, href):
        """Parse the query parameters of a URL or of a bare query string.

        Args:
            href: Either a full URL (``http://host/path?a=1``) or just the
                query part (``a=1&b=2``).

        Returns:
            A dict mapping each parameter name to the list of its values,
            as produced by ``parse_qs``.
        """
        # parse_qs expects only the query part. When the text before the
        # first '?' holds no '=', it is a URL prefix and must be cut off,
        # otherwise it ends up glued to the first parameter name.
        head, sep, tail = href.partition('?')
        if sep and '=' not in head:
            query = tail
        else:
            query = href
        return parse_qs(query)

    def remove_all_a_tag_in_html(self, ndung):
        """Replace every ``<a>`` element in *ndung* with its plain text.

        The replacement is textual: the string form of each parsed anchor is
        searched for in *ndung*, so an anchor whose original markup differs
        from the parser's serialisation is left untouched.
        """
        for a_tag in self.get_element_by_css_selector(ndung, 'a'):
            ndung = ndung.replace('%s' % a_tag, a_tag.get_text())
        return ndung

    def download_all_image_in_html_file(self, ndung, folder_save):
        """Download the images referenced by *ndung* and relink them locally.

        For every ``<img ...>`` tag the ``data-original`` attribute wins over
        ``src`` (lazy-loaded images keep a placeholder in ``src``). Each
        image is fetched through ``Image_Interact.downloadImg`` and every
        occurrence of its URL in the markup is replaced by
        ``<folder_save>/<generated name>.jpg``.

        Args:
            ndung: HTML markup.
            folder_save: Folder handed to ``downloadImg`` and used as the
                prefix of the rewritten image paths.

        Returns:
            A ``(list_local_image, ndung)`` tuple: the generated file names
            (with ``.jpg`` suffix) and the rewritten markup.
        """
        image_Interact1 = Image_Interact()

        # Collect the real source of every image tag.
        list_image_src = []
        for image_tag in self.regex_many_value(r'<img(.*?)>', ndung):
            src = self.regex_one_value(r'src="(.*?)"', image_tag)
            data_original = self.regex_one_value(
                r'data-original="(.*?)"', image_tag)
            if data_original:
                real_source = data_original
                # Point src at the real image instead of the placeholder.
                # Guard against an empty src: str.replace('', x) would
                # insert x between every character of the tag.
                if src:
                    image_tag_new = image_tag.replace(src, real_source)
                    ndung = ndung.replace(image_tag, image_tag_new)
            else:
                real_source = src
            # A tag without any source has nothing to download, and an empty
            # search string would corrupt the whole document further down.
            if real_source:
                list_image_src.append(real_source)

        # Download each image under a name that is unique within this call.
        # NOTE(review): a URL used by several tags is still downloaded once
        # per tag, as before; only the first file ends up referenced.
        list_local_image = []
        used_names = set()
        for image_src in list_image_src:
            nameFile = '%s-%s' % (int(time.time()), random.randint(0, 1000))
            while nameFile in used_names:
                nameFile = '%s-%s' % (
                    int(time.time()), random.randint(0, 1000))
            used_names.add(nameFile)
            image_Interact1.downloadImg(image_src, nameFile, folder_save)
            # presumably downloadImg stores the file as <nameFile>.jpg -
            # TODO confirm against Image_Interact
            list_local_image.append(nameFile + '.jpg')

        # Point the markup at the local copies.
        for image_src, local_image in zip(list_image_src, list_local_image):
            ndung = ndung.replace(
                image_src, '%s/%s' % (folder_save, local_image))
        return list_local_image, ndung

    def randomFirstName(self, type='en'):
        """Pick a random first name; ``type='vi'`` selects the Vietnamese list."""
        if type == 'vi':
            return random.choice(list_first_name_vi)
        return random.choice(list_first_name_en)

    def randomLastName(self, type='en'):
        """Pick a random last name; ``type='vi'`` selects the Vietnamese list."""
        if type == 'vi':
            return random.choice(list_last_name_vi)
        return random.choice(list_last_name_en)

    def randomUserName(self, type='en'):
        """Return ``<first>_<last>_<4-digit number>`` built from random parts."""
        firstname = self.randomFirstName(type)
        lastname = self.randomLastName(type)
        number = random.randint(1000, 9999)
        return '%s_%s_%s' % (firstname, lastname, number)

    def randomPass(self, type='en'):
        """Return ``<random user name>@<4-digit number>``.

        NOTE(review): built on the ``random`` module, which is not meant for
        security purposes - do not use the result to protect anything
        valuable.
        """
        username = self.randomUserName(type)
        number = random.randint(1000, 9999)
        return '%s@%s' % (username, number)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment