Skip to content

Instantly share code, notes, and snippets.

@JoJoseph25
Last active June 29, 2021 08:28
Show Gist options
  • Save JoJoseph25/67b3fe911430419942cf5747af8298ed to your computer and use it in GitHub Desktop.
Save JoJoseph25/67b3fe911430419942cf5747af8298ed to your computer and use it in GitHub Desktop.
Parse multiple specified YouTube video URLs at the same time using parallel processing and save the results to an Excel file.
from multiprocessing import Process, Manager
def channel2excel(url_csv, from_date, to_date, n_workers=18):
    """Check the video URLs listed in a CSV in parallel and save the hits to Excel.

    The 'Video URL' column of ``url_csv`` is split into ``n_workers`` chunks
    and each chunk is handed to ``check_url`` in its own process. The records
    collected by the workers are sorted by ``upload_date`` (ascending) and
    written to an ``.xlsx`` file.

    Relies on ``pd`` (pandas), ``np`` (numpy) and ``check_url`` being
    available at module level.

    Args:
        url_csv: Path of the CSV file to read; it must contain a
            'Video URL' column. The output name is the part of this path
            before the first '_reverse' (the whole path if that marker is
            absent) followed by '_checked.xlsx'.
        from_date: Forwarded unchanged to ``check_url``.
        to_date: Forwarded unchanged to ``check_url``.
        n_workers: Number of worker processes / URL chunks. Defaults to 18,
            the value that was previously hard-coded.

    Returns:
        A DataFrame of the collected records sorted by ``upload_date``.
        When the workers report no records, an empty DataFrame is returned
        and no Excel file is written.
    """
    print(url_csv,' checking started')
    save_name = url_csv.split('_reverse')[0] + '_checked.xlsx'

    df = pd.read_csv(url_csv)
    # array_split tolerates a URL count that is not a multiple of n_workers
    # (some chunks may be shorter or empty).
    chunks = [list(chunk)
              for chunk in np.array_split(df['Video URL'].values, n_workers)]

    with Manager() as manager:
        # NOTE(review): check_url presumably appends one record per accepted
        # URL to this shared list -- confirm against its definition.
        shared_records = manager.list()
        procs = []
        for i, chunk in enumerate(chunks):
            proc = Process(target=check_url,
                           args=(chunk, from_date, to_date, shared_records, i))
            procs.append(proc)
            proc.start()
        for proc in procs:
            proc.join()
        # Snapshot into a plain list while the manager is still alive; the
        # proxy cannot be read once the ``with`` block has shut it down.
        valid_url = list(shared_records)

    new_df = pd.DataFrame.from_records(valid_url)
    if new_df.empty:
        # Without records there is no 'upload_date' column, so the date
        # conversion below would raise KeyError.
        print('No valid URLs found for', url_csv)
        return new_df

    new_df['upload_date'] = pd.to_datetime(new_df['upload_date'], format='%Y-%m-%d')
    new_df = new_df.sort_values(by=['upload_date'], ascending=True)
    new_df = new_df.reset_index(drop=True)
    new_df.to_excel(save_name, index=False)
    print('File saved at', save_name)
    return new_df
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment