diff --git a/examples/people-to-csv.py b/examples/people-to-csv.py new file mode 100644 index 0000000..5dea241 --- /dev/null +++ b/examples/people-to-csv.py @@ -0,0 +1,25 @@ +"""Example to scrape a list of Companies, and put overviews in csv form""" + +from scrape_linkedin import CompanyScraper +import pandas as pd + +# LIST YOUR COMPANIES HERE +my_company_list = [ + 'facebook', 'mit-sloan-school-of-management', 'linkedin', + 'harvard-university' +] + +company_data = [] + +with CompanyScraper() as scraper: + # Get each company's overview, add to company_data list + for name in my_company_list: + sc = scraper.scrape(company=name, people=True) + overview = sc.overview + overview['company_name'] = name + overview['people'] = sc.people + company_data.append(overview) + +# Turn into dataframe for easy csv output +df = pd.DataFrame(company_data) +df.to_csv('out.csv', index=False) diff --git a/scrape_linkedin/Company.py b/scrape_linkedin/Company.py index 5b9f9ab..b76d9bc 100644 --- a/scrape_linkedin/Company.py +++ b/scrape_linkedin/Company.py @@ -40,7 +40,8 @@ def get_company_metadata(about_section): elif child.name == 'dd': content = child.get_text().strip() results[curr_header].append( - RE_DUPLICATE_WHITESPACE.sub(" ", content)) # strip redundant whitespace + RE_DUPLICATE_WHITESPACE.sub( + " ", content)) # strip redundant whitespace for r in results: results[r] = '\n'.join(results[r]) @@ -58,16 +59,18 @@ def get_employee_count(s: str) -> Optional[int]: class Company(ResultsObject): """Linkedin User Profile Object""" - attributes = ['overview', 'jobs', 'life', 'insights'] + attributes = ['overview', 'jobs', 'life', 'insights', 'people'] + # KD adds people attribute - def __init__(self, overview, jobs, life, insights): + def __init__(self, overview, jobs, life, insights, people): # KD fixed attributes making jobs and life undefined as they are defined in CompanyScraper, and this allows insights to work self.overview_soup = BeautifulSoup(overview, 'html.parser') 
self.jobs_soup = BeautifulSoup(jobs, 'html.parser') self.life_soup = BeautifulSoup(life, 'html.parser') self.insights_soup = BeautifulSoup(insights, 'html.parser') # KD adds insights soup + self.people_soup = BeautifulSoup(people, 'html.parser') @property def overview(self): @@ -78,12 +81,12 @@ def overview(self): "image": None, "name": None, "num_employees": None, + "num_followers": None, "metadata": None } # Banner containing company Name + Location - banner = one_or_default( - self.overview_soup, '.org-top-card') + banner = one_or_default(self.overview_soup, '.org-top-card') # Main container with company overview info container = one_or_default(self.overview_soup, @@ -92,14 +95,20 @@ def overview(self): overview["name"] = text_or_default(self.overview_soup, "#main h1") overview['description'] = text_or_default(container, 'section > p') - logo_image_tag = one_or_default( - banner, '.org-top-card-primary-content__logo') + banner_desp = text_or_default(banner, + '.org-top-card-summary-info-list') + num_followers = banner_desp.split(" ")[-2].strip() + + overview["num_followers"] = num_followers + + logo_image_tag = one_or_default(banner, + '.org-top-card-primary-content__logo') overview['image'] = logo_image_tag['src'] if logo_image_tag else '' company_metadata = get_company_metadata(container) overview["metadata"] = company_metadata - overview["num_employees"] = get_employee_count(company_metadata.get( - COMPANY_SIZE_KEY, "")) + overview["num_employees"] = get_employee_count( + company_metadata.get(COMPANY_SIZE_KEY, "")) return overview @@ -116,15 +125,24 @@ def life(self): def insights(self): # summary table containing the Insights data for % change in headcount at 6m, 1y and 2y - table = one_or_default( - self.insights_soup, '.org-insights-module__summary-table') + table = one_or_default(self.insights_soup, + '.org-insights-module__summary-table') insights = {} - insights.update(get_info(table, { - '6m change': 'td:nth-of-type(2) span:nth-of-type(3)', - '1y 
change': 'td:nth-of-type(3) span:nth-of-type(3)', - '2y change': 'td:nth-of-type(4) span:nth-of-type(3)' - - })) + insights.update( + get_info( + table, { + '6m change': 'td:nth-of-type(2) span:nth-of-type(3)', + '1y change': 'td:nth-of-type(3) span:nth-of-type(3)', + '2y change': 'td:nth-of-type(4) span:nth-of-type(3)' + })) return insights + + @property + def people(self): + content = one_or_default(self.people_soup, + '.org-grid__content-height-enforcer') + people = text_or_default(content, 'div > div > div > h2') + people = people.replace("employees", "").replace("alumni", "").strip() + return people diff --git a/scrape_linkedin/CompanyScraper.py b/scrape_linkedin/CompanyScraper.py index 46f9ffb..e86753f 100644 --- a/scrape_linkedin/CompanyScraper.py +++ b/scrape_linkedin/CompanyScraper.py @@ -1,5 +1,5 @@ import logging - +import time from selenium.common.exceptions import TimeoutException from selenium.webdriver.common.by import By from selenium.webdriver.support import expected_conditions as EC @@ -13,13 +13,30 @@ class CompanyScraper(Scraper): - def scrape(self, company, overview=True, jobs=False, life=False, insights=False): - self.url = 'https://www.linkedin.com/company/{}'.format(company) + + def scrape(self, + company, + org_type="company", + overview=True, + jobs=False, + life=False, + insights=False, + people=False): + + # org_type = "company" or "school" + # This allows switching between school and company URLs + # The underlying functionality is the same for both scrapers + # Added parameters: org_type, people + # The people page for a company is the same as the alumni page when org_type="school" + # The people page for a company shows employee data, whereas for a school it shows alumni data + + self.url = 'https://www.linkedin.com/{org_type}/{company}'.format( + org_type=org_type, company=company) self.company = company self.load_initial() - jobs_html = life_html = insights_html = overview_html = '' if 
overview: overview_html = self.fetch_page_html('about') @@ -29,14 +46,27 @@ def scrape(self, company, overview=True, jobs=False, life=False, insights=False) jobs_html = self.fetch_page_html('jobs') if insights: insights_html = self.fetch_page_html('insights') - return Company(overview_html, jobs_html, life_html, insights_html) + if people: + people_html = self.fetch_page_html('people') + + return Company(overview_html, jobs_html, life_html, insights_html, + people_html) def fetch_page_html(self, page): """ Navigates to a company subpage and returns the entire HTML contents of the page. """ + + if page == "people": + interval = 2.0 + else: + interval = 0.1 + try: self.driver.get(f"{self.url}/{page}") + # people/alumni javascript takes more time to load + time.sleep(interval) + return self.driver.find_element_by_css_selector( '.organization-outlet').get_attribute('outerHTML') except Exception as e: @@ -47,20 +77,22 @@ def fetch_page_html(self, page): def load_initial(self): self.driver.get(self.url) try: - myElem = WebDriverWait(self.driver, self.timeout).until(AnyEC( - EC.presence_of_element_located( - (By.CSS_SELECTOR, '.organization-outlet')), - EC.presence_of_element_located( - (By.CSS_SELECTOR, '.error-container')) - )) + myElem = WebDriverWait(self.driver, self.timeout).until( + AnyEC( + EC.presence_of_element_located( + (By.CSS_SELECTOR, '.organization-outlet')), + EC.presence_of_element_located( + (By.CSS_SELECTOR, '.error-container')))) except TimeoutException as e: raise ValueError( """Took too long to load company. Common problems/solutions: 1. Invalid LI_AT value: ensure that yours is correct (they update frequently) - 2. Slow Internet: increase the timeout parameter in the Scraper constructor""") + 2. 
Slow Internet: increase the timeout parameter in the Scraper constructor""" + ) try: self.driver.find_element_by_css_selector('.organization-outlet') except: raise ValueError( - 'Company Unavailable: Company link does not match any companies on LinkedIn') + 'Company Unavailable: Company link does not match any companies on LinkedIn' + )