Skip to content

Instantly share code, notes, and snippets.

@wemakefuture
Last active January 3, 2019 15:50
Show Gist options
  • Save wemakefuture/8c84fa97bd56be711cf2842eb8007f70 to your computer and use it in GitHub Desktop.
Save wemakefuture/8c84fa97bd56be711cf2842eb8007f70 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup
import re
import requests
import pandas as pd
from tabulate import tabulate
import os
import csv
# Scrape the public member list of kompetenznetz-mittelstand.de into
# ``ops.csv`` and then write a copy without image-only entries to
# ``correct.csv``.

BASE_URL = 'https://www.kompetenznetz-mittelstand.de'
LIST_URL_TEMPLATE = (
    BASE_URL
    + '/de/app/account/list/public?s=IwvKhoY3u236pAafN&action=list&page={page}'
    + '&item_count=200'
)
RAW_CSV = 'ops.csv'
CLEAN_CSV = 'correct.csv'
REQUEST_TIMEOUT = 30  # seconds; keeps a stalled connection from hanging the run

# Elements removed from each page before links are collected, in this order.
# Presumably navigation/button groups (judging by the class names) whose
# anchors are not company entries -- TODO confirm against the live markup.
STRIPPED_CLASSES = (
    'sf-nav sf-nav-tabmenu sf-hide-print',
    'left',
    'sf-btn-group left',
    'sf-btn-group',
)

# Serialized prefix of a name cell whose anchor started with an image tag.
IMAGE_CELL_PREFIX = '<img alt'


def build_page_urls(first_page=1, last_page=145):
    """Return the list-page URLs for pages ``first_page``..``last_page``.

    Both bounds are inclusive; the defaults reproduce the 145 pages the
    script has always requested.
    """
    return [
        LIST_URL_TEMPLATE.format(page=page_number)
        for page_number in range(first_page, last_page + 1)
    ]


def extract_company_rows(html):
    """Parse one list page and return ``[name, link]`` rows.

    ``name`` is the string form of the first child of each anchor inside
    the ``sfsDialogContent`` element, and ``link`` is the anchor's ``href``
    prefixed with ``BASE_URL``. Anchors without an ``href`` or without any
    content are skipped. A page lacking the expected elements yields an
    empty list instead of raising ``AttributeError``.
    """
    soup = BeautifulSoup(html, 'html.parser')

    # Only the first <img> in the document is dropped, as before.
    first_image = soup.img
    if first_image is not None:
        first_image.decompose()

    for css_class in STRIPPED_CLASSES:
        element = soup.find(class_=css_class)
        if element is not None:
            element.decompose()

    container = soup.find(class_='sfsDialogContent')
    if container is None:
        return []

    rows = []
    for anchor in container.find_all('a'):
        href = anchor.get('href')
        if href is None or not anchor.contents:
            continue
        # The first child may be a tag (e.g. an <img>); str() serializes it
        # so that drop_image_rows() can recognise and discard it later.
        rows.append([str(anchor.contents[0]), BASE_URL + href])
    return rows


def drop_image_rows(rows):
    """Yield the rows whose first cell does not start with ``<img alt``.

    Empty rows are skipped as well, so indexing the first cell cannot fail.
    """
    for row in rows:
        if row and not row[0].startswith(IMAGE_CELL_PREFIX):
            yield row


def scrape_to_csv(raw_path=RAW_CSV, page_urls=None):
    """Download every list page and write all rows found to ``raw_path``.

    The file starts with a ``Name,Link`` header row. ``page_urls`` defaults
    to ``build_page_urls()``.

    Raises:
        requests.RequestException: on a network failure, a timeout, or an
            HTTP error status (via ``raise_for_status``).
    """
    if page_urls is None:
        page_urls = build_page_urls()

    # newline='' is what the csv module expects; without it Windows output
    # gets a blank line after every row.
    with open(raw_path, 'w', encoding='utf-8', newline='') as raw_file:
        writer = csv.writer(raw_file)
        writer.writerow(['Name', 'Link'])
        for page_url in page_urls:
            response = requests.get(page_url, timeout=REQUEST_TIMEOUT)
            response.raise_for_status()
            writer.writerows(extract_company_rows(response.text))


def write_clean_csv(raw_path=RAW_CSV, clean_path=CLEAN_CSV):
    """Copy ``raw_path`` to ``clean_path`` without the image-only rows.

    The raw file is opened and handed to ``csv.reader`` as a file object;
    passing the filename string instead would iterate over its characters.
    """
    with open(raw_path, encoding='utf-8', newline='') as raw_file, \
            open(clean_path, 'w', encoding='utf-8', newline='') as clean_file:
        csv.writer(clean_file).writerows(drop_image_rows(csv.reader(raw_file)))


def main():
    """Run the scrape, then produce the cleaned CSV."""
    # scrape_to_csv closes (and therefore flushes) the raw file before
    # write_clean_csv reads it back.
    scrape_to_csv()
    write_clean_csv()


if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment