diff --git a/linkedin_scraper/company.py b/linkedin_scraper/company.py
index 77900eb..ce77e10 100644
--- a/linkedin_scraper/company.py
+++ b/linkedin_scraper/company.py
@@ -48,7 +48,7 @@ class Company(Scraper):
     employees = []
     headcount = None
 
-    def __init__(self, linkedin_url = None, name = None, about_us =None, website = None, headquarters = None, founded = None, industry = None, company_type = None, company_size = None, specialties = None, showcase_pages =[], affiliated_companies = [], driver = None, scrape = True, get_employees = True, close_on_complete = True):
+    def __init__(self, linkedin_url = None, name = None, about_us =None, website = None, headquarters = None, founded = None, industry = None, company_type = None, company_size = None, specialties = None, showcase_pages =[], affiliated_companies = [], driver = None, scrape = True, get_employees = True, close_on_complete = True, post_event = None):
         self.linkedin_url = linkedin_url
         self.name = name
         self.about_us = about_us
@@ -62,6 +62,8 @@ def __init__(self, linkedin_url = None, name = None, about_us =None, website = N
         self.showcase_pages = showcase_pages
         self.affiliated_companies = affiliated_companies
 
+        self.post_event = post_event
+
         if driver is None:
             try:
                 if os.getenv("CHROMEDRIVER") == None:
@@ -220,6 +222,10 @@ def scrape_logged_in(self, get_employees = True, close_on_complete = True):
         # if num_attributes == 0:
         #     exit()
         x_off = 0
+
+        if self.post_event and callable(self.post_event):
+            self.post_event(driver, cls=self, data={'labels': labels, 'values': values, 'x_off': x_off})
+
         for i in range(num_attributes):
             txt = labels[i].text.strip()
             if txt == 'Website':
diff --git a/samples/scrape_company_with_post_process.py b/samples/scrape_company_with_post_process.py
new file mode 100644
index 0000000..570ce96
--- /dev/null
+++ b/samples/scrape_company_with_post_process.py
@@ -0,0 +1,52 @@
+import os
+import re
+
+from linkedin_scraper import Company, actions
+from selenium import webdriver
+driver = webdriver.Chrome("./chromedriver")
+
+Company.employees_count = None  # Patch to register more company fields
+
+
+def post_profile_parsing(driver, cls, data):
+    """Record the "on LinkedIn" employee count on the scraped company.
+
+    Company.scrape_logged_in invokes this hook as
+    ``post_event(driver, cls=<Company instance>, data={...})`` right before
+    its own attribute loop; ``data`` carries ``labels``, ``values`` and
+    ``x_off``.
+    """
+    labels = data.get('labels') or []
+    values = data.get('values') or []
+    x_off = data.get('x_off') or 0
+    num_attributes = min(len(labels), len(values))
+
+    for i in range(num_attributes):
+        txt = labels[i].text.strip()
+        if txt != 'Company size':
+            continue
+        try:
+            # Presumably the entry after the size range holds the
+            # "... on LinkedIn" member count -- verify against the page.
+            employee_count_txt = values[i + x_off + 1].text.strip()
+        except Exception as exc:
+            # Best effort: a missing or stale element must not abort the
+            # scrape, but report it instead of hiding it behind an assert.
+            print("Could not read employee count: {!r}".format(exc))
+            continue
+        if 'on LinkedIn' in employee_count_txt:
+            # Keep only the digits; no count/flags needed for this pattern.
+            cls.employees_count = re.sub(r"\D+", "", employee_count_txt)
+
+
+email = os.getenv("LINKEDIN_USER")
+password = os.getenv("LINKEDIN_PASSWORD")
+actions.login(driver, email, password)  # if email and password isn't given, it'll prompt in terminal
+
+
+company = Company(
+    'https://www.linkedin.com/company/national-basketball-association/',
+    post_event=post_profile_parsing,
+    get_employees=False,
+    driver=driver
+)