Web scraping
<h1> to <h6>: Headings, with <h1> being the largest and <h6> the smallest.
<p>: Paragraphs of text.
<a>: Hyperlinks, allowing you to navigate to other pages.
<img>: Images.
<ul>, <ol>, <li>: Lists, both unordered and ordered.
<a href="https://www.example.com">Visit Example</a>
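To make these building blocks concrete, here is a minimal sketch (the HTML fragment and variable names are illustrative, not taken from any real page) that parses a few of these elements with BeautifulSoup and reads back their text and attributes:

from bs4 import BeautifulSoup

# A tiny, made-up HTML fragment using the tags described above
html_doc = """
<h1>My Favorite Movies</h1>
<p>A short list of films worth watching.</p>
<ul>
    <li><a href="https://www.example.com">Visit Example</a></li>
    <li><img src="poster.jpg" alt="Movie poster"></li>
</ul>
"""

soup = BeautifulSoup(html_doc, "html.parser")

print(soup.h1.text)      # My Favorite Movies
print(soup.p.text)       # A short list of films worth watching.
print(soup.a["href"])    # https://www.example.com
print(soup.img["alt"])   # Movie poster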
import requests
from bs4 import BeautifulSoup
import pandas as pd

# URL of the IMDb list to scrape
url = "https://www.imdb.com/list/ls566941243/"

# Step 1: Send a GET request to the specified URL
response = requests.get(url)

# Step 2: Parse the HTML content of the response using BeautifulSoup
soup = BeautifulSoup(response.text, 'html.parser')

# Step 3: Save the HTML content to a text file for reference
with open("imdb.txt", "w", encoding="utf-8") as file:
    file.write(str(soup))
print("Page content has been saved to imdb.txt")

# Step 4: Extract movie data from the parsed HTML and store it in a list
movies_data = []
for movie in soup.find_all('div', class_='lister-item-content'):
    title = movie.find('a').text
    genre = movie.find('span', class_='genre').text.strip()
    stars = movie.find('div', class_='ipl-rating-star').find('span', class_='ipl-rating-star__rating').text
    runtime = movie.find('span', class_='runtime').text
    rating = movie.find('span', class_='ipl-rating-star__rating').text
    movies_data.append([title, genre, stars, runtime, rating])

# Step 5: Create a Pandas DataFrame from the extracted movie data
df = pd.DataFrame(movies_data, columns=['Title', 'Genre', 'Stars', 'Runtime', 'Rating'])

# Display the resulting DataFrame
df
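As a small follow-up, the DataFrame can also be written to disk so the scraped list survives the session (the CSV filename below is just an illustration):

# Persist the scraped movies for later use (filename is illustrative)
df.to_csv("imdb_movies.csv", index=False)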
# Import necessary libraries
import scrapy
from scrapy.crawler import CrawlerProcess

# Define the Spider class for IMDb data extraction
class IMDbSpider(scrapy.Spider):
    # Name of the spider
    name = "imdb_spider"

    # Starting URL(s) for the spider to crawl
    start_urls = ["https://www.imdb.com/list/ls566941243/"]
    # start_urls = [url]

    # Parse method to extract data from the webpage
    def parse(self, response):
        # Iterate over each movie item on the webpage
        for movie in response.css('div.lister-item-content'):
            yield {
                'title': movie.css('h3.lister-item-header a::text').get(),
                'genre': movie.css('p.text-muted span.genre::text').get(),
                'runtime': movie.css('p.text-muted span.runtime::text').get(),
                'rating': movie.css('div.ipl-rating-star span.ipl-rating-star__rating::text').get(),
            }

# Initialize a CrawlerProcess instance with settings
process = CrawlerProcess(settings={
    'FEED_FORMAT': 'json',
    'FEED_URI': 'output.json',  # This will overwrite the file every time you run the spider
})

# Add the IMDbSpider to the crawling process
process.crawl(IMDbSpider)

# Start the crawling process
process.start()
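Once the crawl finishes, the JSON feed can be loaded back with pandas to confirm the spider captured what we expect; this is a quick sanity check assuming output.json was written as configured above:

import pandas as pd

# Load the feed written by the spider and inspect the first rows
scraped_df = pd.read_json("output.json")
print(scraped_df.head())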
from selenium import webdriver
from bs4 import BeautifulSoup
import pandas as pd

# URL of the IMDb list
url = "https://www.imdb.com/list/ls566941243/"

# Set up Chrome options to run the browser in incognito mode
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument("--incognito")

# Initialize the Chrome driver with the specified options
driver = webdriver.Chrome(options=chrome_options)

# Navigate to the IMDb list URL
driver.get(url)

# Wait for the page to load (adjust the wait time according to your webpage)
driver.implicitly_wait(10)

# Get the HTML content of the page after it has fully loaded
html_content = driver.page_source

# Parse the HTML content with BeautifulSoup
soup = BeautifulSoup(html_content, 'html.parser')

# Save the HTML content to a text file for reference
with open("imdb_selenium.txt", "w", encoding="utf-8") as file:
    file.write(str(soup))
print("Page content has been saved to imdb_selenium.txt")

# Extract movie data from the parsed HTML
movies_data = []
for movie in soup.find_all('div', class_='lister-item-content'):
    title = movie.find('a').text
    genre = movie.find('span', class_='genre').text.strip()
    stars = movie.select_one('div.ipl-rating-star span.ipl-rating-star__rating').text
    runtime = movie.find('span', class_='runtime').text
    rating = movie.select_one('div.ipl-rating-star span.ipl-rating-star__rating').text
    movies_data.append([title, genre, stars, runtime, rating])

# Create a Pandas DataFrame from the collected movie data
df = pd.DataFrame(movies_data, columns=['Title', 'Genre', 'Stars', 'Runtime', 'Rating'])

# Display the resulting DataFrame
print(df)

# Close the Chrome driver
driver.quit()
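A note on waiting: implicitly_wait is a blunt instrument, and slow-loading lists can still slip through. A more reliable option is an explicit wait for the movie containers; the sketch below would replace the implicit-wait step above and assumes the same div.lister-item-content markup used throughout this example:

from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By

# Wait up to 10 seconds for at least one movie container to appear
WebDriverWait(driver, 10).until(
    EC.presence_of_element_located((By.CSS_SELECTOR, "div.lister-item-content"))
)
html_content = driver.page_source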
import requests
from lxml import html
import pandas as pd
# Define the URL
url = "https://www.imdb.com/list/ls566941243/"
# Send an HTTP request to the URL and get the response
response = requests.get(url)
# Parse the HTML content using lxml
tree = html.fromstring(response.content)
# Extract movie data from the parsed HTML
titles = tree.xpath('//h3[@class="lister-item-header"]/a/text()')
genres = [', '.join(genre.strip() for genre in genre_list.xpath(".//text()")) for genre_list in tree.xpath('//p[@class="text-muted text-small"]/span[@class="genre"]')]
ratings = tree.xpath('//div[@class="ipl-rating-star small"]/span[@class="ipl-rating-star__rating"]/text()')
runtimes = tree.xpath('//p[@class="text-muted text-small"]/span[@class="runtime"]/text()')
# Create a dictionary with extracted data
data = {
    'Title': titles,
    'Genre': genres,
    'Rating': ratings,
    'Runtime': runtimes
}
# Create a DataFrame from the dictionary
df = pd.DataFrame(data)
# Display the resulting DataFrame
df.head()
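One caveat with building a DataFrame from separate XPath result lists: if a movie is missing, say, a runtime, the lists end up with different lengths and pd.DataFrame(data) raises a ValueError. A quick sanity check (a sketch, not part of the original code) that can be run just before the pd.DataFrame(data) call catches this:

# Verify that every column list has the same length before building the DataFrame
lengths = {name: len(values) for name, values in data.items()}
if len(set(lengths.values())) != 1:
    print(f"Warning: column lengths differ: {lengths}")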
# Resolve async issues by applying nest_asyncio
import nest_asyncio
nest_asyncio.apply()
# Import required modules from langchain
from langchain_openai import ChatOpenAI
from langchain_community.document_loaders import AsyncChromiumLoader
from langchain_community.document_transformers import BeautifulSoupTransformer
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains import create_extraction_chain
# Define the URL
url = "https://www.imdb.com/list/ls566941243/"
# Initialize ChatOpenAI instance with OpenAI API key
llm = ChatOpenAI(openai_api_key=MY_OPENAI_KEY)
# Load HTML content using AsyncChromiumLoader
loader = AsyncChromiumLoader([url])
docs = loader.load()
# Save the HTML content to a text file for reference
with open("imdb_langchain_html.txt", "w", encoding="utf-8") as file:
file.write(str(docs[0].page_content))
print("Page content has been saved to imdb_langchain_html.txt")
# Transform the loaded HTML using BeautifulSoupTransformer
bs_transformer = BeautifulSoupTransformer()
docs_transformed = bs_transformer.transform_documents(
    docs, tags_to_extract=["h3", "p"]
)
# Split the transformed documents using RecursiveCharacterTextSplitter
splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size=1000, chunk_overlap=0)
splits = splitter.split_documents(docs_transformed)
Basically, after obtaining the required HTML, we will ask the LLM: “Hey LLM, with this HTML, please fill in the information according to the schema below.”
# Define a JSON schema for movie data validation
schema = {
    "properties": {
        "movie_title": {"type": "string"},
        "stars": {"type": "integer"},
        "genre": {"type": "array", "items": {"type": "string"}},
        "runtime": {"type": "string"},
        "rating": {"type": "string"},
    },
    "required": ["movie_title", "stars", "genre", "runtime", "rating"],
}

# Imports needed for timing the extraction and logging it to Comet
import time
import comet_llm

def extract_movie_data(content: str, schema: dict):
    """
    Extract movie data from content using a specified JSON schema.

    Parameters:
    - content (str): Text content containing movie data.
    - schema (dict): JSON schema for validating the movie data.

    Returns:
    - dict: Extracted movie data.
    """
    # Run the extraction chain with the provided schema and content
    start_time = time.time()
    extracted_content = create_extraction_chain(schema=schema, llm=llm).run(content)
    end_time = time.time()

    # Log metadata and output in the Comet project for tracking purposes
    comet_llm.log_prompt(
        prompt=str(content),
        metadata={
            "schema": schema
        },
        output=extracted_content,
        duration=end_time - start_time,
    )

    return extracted_content
# Extract movie data using the defined schema and the first split page content
extracted_content = extract_movie_data(schema=schema, content=splits[0].page_content)
# Display the extracted movie data
extracted_content
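create_extraction_chain typically returns a list of dictionaries shaped by the schema, so as a final step (a sketch assuming that shape) the extracted records can be collected into a DataFrame just like in the earlier approaches:

import pandas as pd

# Assuming extracted_content is a list of dicts keyed by the schema fields
extracted_df = pd.DataFrame(extracted_content)
print(extracted_df.head())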