How to scrape data from LinkedIn using Python
LinkedIn is one of the largest professional social media websites in the world and is a good source of professional and job-related data. Using web scraping, you can gather these data fields for analysis, and Python makes the process smoother.
Web Scraping is a technique used to extract data from websites. It can be performed in two ways:
- Direct HTTP requests: The best choice for static websites.
- Driving a Web Browser: The best choice for dynamic websites with content asynchronously loaded or IFrames.
Prerequisites for Scraping LinkedIn:
Below is the code to scrape LinkedIn with Python.
"""Scrape basic company details from LinkedIn company pages.

NOTE(review): scraping LinkedIn may violate its Terms of Service, and the
page markup / embedded JSON this script targets changes frequently — the
xpath selectors below reflect an old page layout; verify before relying
on them.
"""
import csv   # kept from the original import block (unused in this snippet)
import json
import os    # kept from the original import block (unused in this snippet)
from time import sleep

# Desktop browser User-Agent so the request is less likely to be rejected.
USER_AGENT = ('Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 '
              '(KHTML, like Gecko) Chrome/42.0.2311.90 Safari/537.36')

RETRY_COUNT = 5  # attempts per URL before giving up


def extract_company_data(company_json, url):
    """Flatten LinkedIn's embedded company JSON into a result dict.

    Parameters:
        company_json (dict): parsed JSON blob found in the page's
            ``<code id="stream-promo-top-bar-embed-id-content">`` element.
        url (str): page URL, echoed into the result for traceability.

    Returns:
        dict: company fields; any key missing from the JSON maps to None.
    """
    headquarters = company_json.get('headquarters') or {}
    street1 = headquarters.get('street1')
    street2 = headquarters.get('street2')
    # Join only the street parts that exist. The original concatenated
    # street1 + ', ' + street2 unconditionally, which raised TypeError
    # whenever either part was None.
    street = ', '.join(part for part in (street1, street2) if part) or None
    return {
        'company_name': company_json.get('companyName'),
        'size': company_json.get('size'),
        'industry': company_json.get('industry'),
        'description': company_json.get('description'),
        'follower_count': company_json.get('followerCount'),
        'founded': company_json.get('yearFounded'),
        'website': company_json.get('website'),
        'type': company_json.get('companyType'),
        'specialities': company_json.get('specialties'),
        'city': headquarters.get('city'),
        'country': headquarters.get('country'),
        'state': headquarters.get('state'),
        'street': street,
        'zip': headquarters.get('zip'),
        'url': url,
    }


def linkedin_companies_parser(url):
    """Fetch a LinkedIn company page and return its details as a dict.

    Retries up to RETRY_COUNT times on failures, captcha pages, or
    login-page redirects.

    Parameters:
        url (str): the company page URL to scrape.

    Returns:
        dict | None: extracted company data, or None if every attempt failed.
    """
    # Imported lazily so the pure helpers above remain usable without the
    # third-party requests/lxml packages installed.
    import requests
    from lxml import html

    headers = {'User-Agent': USER_AGENT}
    for _attempt in range(RETRY_COUNT):
        try:
            print("Fetching :", url)
            # NOTE(review): verify=False (carried over from the original)
            # disables TLS certificate checking — consider removing it.
            response = requests.get(url, headers=headers, verify=False)
            # response.text is a decoded str. The original called
            # .replace() with str arguments on response.content (bytes),
            # which raises TypeError on Python 3. Stripping the HTML
            # comment markers exposes the embedded JSON to lxml.
            page = response.text.replace('<!--', '').replace('-->', '')
            doc = html.fromstring(page)
            datafrom_xpath = doc.xpath(
                '//code[@id="stream-promo-top-bar-embed-id-content"]//text()')
            if datafrom_xpath:
                try:
                    company_json = json.loads(datafrom_xpath[0])
                except ValueError:
                    # Malformed/unexpected JSON: report and fall through to
                    # the retry heuristics below (matches original flow).
                    print("cant parse page", url)
                else:
                    return extract_company_data(company_json, url)
            # Heuristic from the original: a tiny response or a login
            # redirect usually means a captcha or auth wall — raise so the
            # outer handler retries.
            if len(response.content) < 2000 or "trk=login_reg_redirect" in url:
                if response.status_code == 404:
                    print("linkedin page not found")
                else:
                    raise ValueError(
                        'redirecting to login page or captcha found')
        except Exception:
            # Narrowed from the original bare except (which also swallowed
            # KeyboardInterrupt/SystemExit). Brief pause before retrying.
            print("retrying :", url)
            sleep(1)
    return None


def readurls():
    """Scrape every URL in ``companyurls`` and write results to data.json."""
    companyurls = ['https://www.linkedin.com/company/company_name']
    extracted_data = [linkedin_companies_parser(url) for url in companyurls]
    # with-statement guarantees the handle is closed (the original leaked it).
    with open('data.json', 'w') as f:
        json.dump(extracted_data, f, indent=4)


if __name__ == "__main__":
    readurls()
All you need to do is change the URL in this line
companyurls = ['https://www.linkedin.com/company/scrapehero']
or add more URLs separated by commas to this list.
The output will be in a file called data.json in the same directory
Subscribe
Login
Please login to comment
0 Discussion