@dauuricus
Last active October 30, 2022 07:43
youtube search python
# coding: UTF-8
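# Starting from a YouTube search-results page for the keyword defined below,
# this script recursively follows watch?v= links, keeps only pages whose text
# mentions the keywords, and records each video's ID, channel ID, and publish
# date (also appended to hiro_yt1.txt).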
import urllib.request
import urllib.parse
import re
import time
words = "ひろゆき"   # search keyword (Hiroyuki / Nishimura Hiroyuki)
keywords = urllib.parse.quote(words)
target = "https://www.youtube.com/results?search_query=" + keywords
video_list_0 = []   # URLs queued for the next crawl round
already_list = []   # URLs already fetched
total_list = []     # relevant watch-page URLs found so far
id_cell_1 = {}      # video ID -> channel ID
id_cell_2 = {}      # video ID -> publish date (datePublished)
video_list_0.append(target)
#with open("hiroyuki_yt_urls.list") as f:
#with open("hiroyuki_yt_5_id.list") as f:
# video_list_0 = [s.strip() for s in f.readlines()]
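
# first_gether: fetch one URL, decide whether the page is relevant to the
# keywords (ひろゆき / hiroyuki / 西村博之), record its channelId and
# datePublished when present, and return the watch?v= links found on the page.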
def first_gether(target_url,total_list,id_cell_1,id_cell_2):
    # Fetch the page and decode it to a string.
    html = urllib.request.urlopen(target_url).read()
    html_strings = html.decode()
    del(html)
    video_list = []
    title = re.search(r"(?<=\<title\>).*?(?=\<\/title\>)",html_strings)
    if title is None:
        # No <title> tag: fall back to scanning the whole page for the keywords.
        kakawari = re.search(r'ひろゆき|hiroyuki|西村博之',html_strings)
        if kakawari is None:
            print("skip (not relevant)")
        else:
            # Collect 11-character video IDs; drop IDs whose surrounding text
            # does not mention the keywords.
            omitlist = []
            video_ids = re.findall(r"watch\?v=(...........)", html_strings)
            if len(video_ids) > 0:
                for ind,idcode in enumerate(video_ids):
                    mmmm = re.search(rf'{idcode}',html_strings)
                    if mmmm is not None:
                        endindex = mmmm.end(0)
                        maybetitle = html_strings[endindex:(endindex + 1500)]
                        kakawari = re.search(r'ひろゆき|hiroyuki|西村博之',maybetitle)
                        if kakawari is None:
                            omitlist.append(video_ids[ind])
                if len(omitlist) > 0:
                    video_ids = [x for x in video_ids if x not in omitlist]
                del(omitlist)
                id_data = list(set(video_ids))
                del(video_ids)
                video_list = ["https://www.youtube.com/watch?v=" + str(x) for x in id_data]
                del(id_data)
        print("skip")
    else:
        title_strings = title.group()
        kakawari = re.search(r'ひろゆき|hiroyuki|西村博之',title_strings)
        if kakawari is None:
            print("skip (not relevant)")
            pass
        else:
            # Relevant page: record it and pull channelId / datePublished /
            # uploadDate out of its <meta> tags.
            total_list.append(target_url)
            idxxx = target_url.replace("https://www.youtube.com/watch?v=","")
            print()
            print(idxxx)
            print('@@@',title_strings)
            print(target_url)
            keyword2 = re.search(r'"channelId" content="',html_strings)
            if keyword2 is None:
                pass
            else:
                channelid = keyword2.end(0)
                print("channelId",html_strings[channelid:(channelid + 24)])
                id_cell_1[idxxx] = html_strings[channelid:(channelid + 24)]
            keyword3 = re.search(r'"datePublished" content="',html_strings)
            if keyword3 is None:
                pass
            else:
                published = keyword3.end(0)
                print("datePublished",html_strings[published:(published + 10)])
                id_cell_2[idxxx] = html_strings[published:(published + 10)]
            keyword4 = re.search(r'"uploadDate" content="',html_strings)
            if keyword4 is None:
                pass
            else:
                uploaded = keyword4.end(0)
                print("uploadDate",html_strings[uploaded:(uploaded + 10)])
            # Harvest linked video IDs from the page for the next crawl round,
            # again dropping IDs whose surrounding text lacks the keywords.
            omitlist = []
            video_ids = re.findall(r"watch\?v=(...........)", html_strings)
            if len(video_ids) > 0:
                for ind,idcode in enumerate(video_ids):
                    mmmm = re.search(rf'{idcode}',html_strings)
                    if mmmm is not None:
                        endindex = mmmm.end(0)
                        maybetitle = html_strings[endindex:(endindex + 1500)]
                        kakawari = re.search(r'ひろゆき|hiroyuki|西村博之',maybetitle)
                        if kakawari is None:
                            omitlist.append(video_ids[ind])
                if len(omitlist) > 0:
                    video_ids = [x for x in video_ids if x not in omitlist]
                del(omitlist)
                id_data = list(set(video_ids))
                del(video_ids)
                print(len(id_data))
                video_list = ["https://www.youtube.com/watch?v=" + str(x) for x in id_data]
                del(id_data)
    return video_list,total_list,id_cell_1,id_cell_2
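
# second_gether: walk the current URL list, call first_gether on each URL not
# yet visited, write confirmed (video ID, channel ID, publish date) rows to
# hiro_yt1.txt, then recurse on the newly discovered URLs.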
def second_gether(x,url_list,already_list,total_list,id_cell_1,id_cell_2):
    video_list = []   # initialized so the return below works even when url_list is empty
    if len(url_list) > 0:
        sum_list = []
        rem = []
        id_cell_list = []
        b_flag = True
        for i,url in enumerate(url_list):
            if not re.match(r'^http',url):
                # if not re.match(r'^@',url):
                #     continue
                url = "https://www.youtube.com/watch?v=" + url
            print()
            print(i + 1)
            print(url)
            # videoid = url.replace('@ ',"")
            videoid = url.replace("https://www.youtube.com/watch?v=","")
            # url = url.replace('@ ',"https://www.youtube.com/watch?v=")
            if videoid in id_cell_1:
                # Metadata for this video is already recorded.
                print(x,';',(i + 1),';',"skip ")
                continue
            if url in already_list:
                # URL was fetched in an earlier round; mark it for removal from the queue.
                #print(x,';',(i + 1),';',"skip ")
                print("skip (not relevant)")
                rem.append(url)
                continue
            #print(x,';',(i + 1),';',url)
            try:
                video_list,total_list,id_cell_1,id_cell_2 = first_gether(url,total_list,id_cell_1,id_cell_2)
            except:
                time.sleep(1)
                continue
            time.sleep(1)
            already_list.append(url)
            # Append the newly confirmed row to the output file.
            if (videoid in id_cell_1) and (videoid in id_cell_2):
                id_cell_list.append([(videoid,id_cell_1[videoid],id_cell_2[videoid])])
                with open("hiro_yt1.txt",mode="a") as out:
                    try:
                        out.write("vi_id:" + videoid + " ch_id:" + id_cell_1[videoid] + " pu_da:" + str(id_cell_2[videoid]) + "\n")
                    except:
                        print("write error")
            print()
            for xxx, id_cell_data in enumerate(id_cell_list):
                print(i, xxx + 1, *id_cell_data)
            if ( len(video_list) > 0 ):
                video_list = list(set(video_list))
                sum_list.extend(video_list)
            if i == 30000 :
                b_flag = False
                break
        total_list = list(set(total_list))
        if ( len(rem) > 0 ):
            # Drop URLs that were queued again but already visited.
            for remove in rem:
                if remove in sum_list:
                    inum = sum_list.index(remove)
                    sum_list.pop(inum)
        # Deduplicate while preserving discovery order, then recurse (depth capped at 300).
        new_list = sorted(list(set(sum_list)),key=sum_list.index)
        x = x + 1
        if ( x < 300 and b_flag ):
            video_list,already_list,total_list,id_cell_1,id_cell_2 = second_gether(x,new_list,already_list,total_list,id_cell_1,id_cell_2)
    return video_list,already_list,total_list,id_cell_1,id_cell_2
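
# Start the crawl from the search-results page.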
counter_x = 0
video_list_0,already_list,total_list,id_cell_1,id_cell_2 = second_gether(counter_x,video_list_0,already_list,total_list,id_cell_1,id_cell_2)
del(video_list_0,already_list)
###for ind,allurl in enumerate(total_list):
###    print(ind,allurl)
#with open("hiroyuki.txt","w") as out:
#
#    counter_i = 0
#    for k,v in id_cell_1.items():
#        counter_i = counter_i + 1
#        out.write(str(counter_i) + " vi_id: " + k + "\n")
#        out.write(str(counter_i) + " pu_da: " + id_cell_2[k] + "\n")
#        out.write(str(counter_i) + " ch_id: " + v + "\n")
#    out.close()
exit()
################################################