I am trying to scrape Walmart’s product page. I have the following script running:
import csv
import time

from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.firefox.service import Service
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
def setup_driver():
    """Create and return a Firefox WebDriver configured for scraping."""
    firefox_options = Options()
    firefox_options.add_argument("--start-maximized")
    # Enable the next line to run without a visible browser window.
    # firefox_options.add_argument("--headless")
    return webdriver.Firefox(options=firefox_options)
def solve_captcha(driver, hold_seconds=12.5):
    """Attempt to solve a press-and-hold (PerimeterX) CAPTCHA.

    Waits up to 10 seconds for the ``#px-captcha`` element, then clicks
    and holds it for ``hold_seconds`` seconds, releases, and clicks once
    more before giving the page time to reload.

    Args:
        driver: Active Selenium WebDriver instance.
        hold_seconds: How long to keep the CAPTCHA button pressed.
            Defaults to 12.5, matching the original hard-coded hold.

    Returns:
        True if a CAPTCHA was found and the attempt completed,
        False if no CAPTCHA appeared or an error occurred.
    """
    try:
        # Wait for the captcha widget to become visible; a timeout here
        # simply means the page loaded without a challenge.
        captcha_element = WebDriverWait(driver, 10).until(
            EC.visibility_of_element_located((By.CSS_SELECTOR, '#px-captcha'))
        )
        print("CAPTCHA detected! Attempting to solve...")

        # Press and hold the widget for the requested duration.
        action = ActionChains(driver)
        action.click_and_hold(captcha_element)
        action.perform()
        time.sleep(hold_seconds)

        # Release, pause briefly, then click once more.
        action.release(captcha_element)
        action.perform()
        time.sleep(0.2)
        action.click(captcha_element)
        action.perform()

        # Give the page time to load after the challenge.
        time.sleep(5)
        print("CAPTCHA solution attempt completed")
        return True
    except TimeoutException:
        # Normal path: no captcha appeared within the wait window.
        print("No standard CAPTCHA detected or error: timed out waiting for #px-captcha")
        return False
    except Exception as e:
        # Unexpected failure while interacting with the captcha element.
        print(f"No standard CAPTCHA detected or error: {str(e)}")
        return False
def _find_first_text(driver, selectors, label):
    """Return the stripped text of the first matching CSS selector.

    Args:
        driver: Active Selenium WebDriver instance.
        selectors: Iterable of CSS selector strings, tried in order.
        label: Human-readable field name, used only for logging.

    Returns:
        The element's stripped text, or 'N/A' if no selector matched.
    """
    for selector in selectors:
        try:
            element = driver.find_element(By.CSS_SELECTOR, selector)
            print(f"Found {label} using selector: {selector}")
            return element.text.strip()
        except Exception:
            # This selector does not exist on this page layout; try the next.
            continue
    return 'N/A'


def scrape_product_details(url, driver):
    """Scrape name, price, and description from a product page.

    Loads *url*, attempts automatic CAPTCHA solving, falls back to asking
    the user to solve it manually, then tries several CSS selectors per
    field because the site serves multiple page layouts.

    Args:
        url: Product page URL to scrape.
        driver: Active Selenium WebDriver instance.

    Returns:
        dict with keys 'url', 'name', 'price', 'about'; fields that could
        not be located are the string 'N/A'.
    """
    driver.get(url)

    # Try to clear any press-and-hold CAPTCHA automatically first.
    solve_captcha(driver)

    # Wait for the product title; if it never appears we may still be
    # stuck behind a CAPTCHA, in which case ask the user to intervene.
    try:
        WebDriverWait(driver, 15).until(
            EC.presence_of_element_located((By.ID, "main-title"))
        )
    except Exception:
        print("Product title not found. Checking if we need to solve CAPTCHA manually...")
        try:
            if driver.find_element(By.CSS_SELECTOR, '#px-captcha'):
                print("CAPTCHA is still present. Please solve it manually.")
                input("Press Enter after solving the CAPTCHA...")
        except Exception:
            print("Could not find CAPTCHA element or product title. Page might not be loading correctly.")

    # Each list covers the known page variants, in order of preference.
    name_selectors = [
        "#main-title",                              # id used in current layout
        "[itemprop='name']",                        # schema.org markup
        "h1[data-fs-element='name']",               # data-fs-element attribute
        "h1.lh-copy.dark-gray.mv1.f4.mh0.b",        # class combination
        "h1[data-seo-id='hero-carousel-image']",    # data-seo-id attribute
    ]
    price_selectors = [
        "[data-testid='price-value']",
        "[itemprop='price']",
        ".w_Gd.w_Gh",
        ".b.black.f1.ma0",                          # common price class combinations
    ]
    about_selectors = [
        "#about-item-section",
        "[data-testid='product-description']",
        "[itemprop='description']",
        ".prod-ProductDetails-description-content",
    ]

    return {
        'url': url,
        'name': _find_first_text(driver, name_selectors, "product name"),
        'price': _find_first_text(driver, price_selectors, "price"),
        'about': _find_first_text(driver, about_selectors, "about section"),
    }
def main():
    """Entry point: scrape one hard-coded product URL and write the CSV."""
    # Hardcoded URL - replace with your target product URL
    url = "https://www.walmart.com/ip/Great-Value-Grade-A-Large-White-Eggs-18-count/145051970"

    driver = setup_driver()
    try:
        print(f"Scraping product details from {url}...")
        product = scrape_product_details(url, driver)
        _print_summary(product)
        _save_to_csv(product)
    finally:
        # Always shut the browser down, even if scraping failed.
        driver.quit()


def _print_summary(product):
    """Print a short human-readable summary of the scraped record."""
    print("\nProduct Details:")
    print(f"Name: {product['name']}")
    print(f"Price: {product['price']}")
    print(f"About: {product['about'][:100]}...")  # first 100 chars only


def _save_to_csv(product):
    """Write the single scraped record to product_details.csv."""
    with open('product_details.csv', 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=product.keys())
        writer.writeheader()
        writer.writerow(product)
    print("\nSaved to product_details.csv")


if __name__ == "__main__":
    main()
But the results are never written to my CSV, and I’m not even sure the request goes through. All I know is that my terminal shows the following:
RunResponse(content="Certainly! I'll utilize the SeleniumScraperTool to extract product details from the provided Walmart product URL. Please hold on for a moment while I perform the task.", content_type='str', thinking=None, event='RunResponse', messages=[Message(role='system', content="You are ProductScraper, an agent that extracts product details (name, price, and description) from e-commerce webpages using Selenium with automatic CAPTCHA handling.\n\n<instructions>\nWhen provided with a product URL, use your SeleniumScraperTool to scrape and return the product details. Also save the results to a CSV file named 'product_details.csv'.\n</instructions>\n\n<additional_information>\n- Use markdown to format your answers.\n</additional_information>", name=None, tool_call_id=None, tool_calls=None, audio=None, images=None, videos=None, audio_output=None, thinking=None, redacted_thinking=None, provider_data=None, reasoning_content=None, tool_name=None, tool_args=None, tool_call_error=None, stop_after_tool_call=False, add_to_agent_memory=True, from_history=False, metrics=MessageMetrics(input_tokens=0, output_tokens=0, total_tokens=0, prompt_tokens=0, completion_tokens=0, prompt_tokens_details=None, completion_tokens_details=None, additional_metrics=None, time=None, time_to_first_token=None, timer=None), references=None, created_at=1742166627), Message(role='user', content='https://www.walmart.com/ip/Nike-Unisex-Everyday-Cotton-Cushioned-Crew-Training-Socks-with-DRI-FIT-Technology-White-6-Pairs/439915194?classType=VARIANT&athbdg=L1600&from=/search', name=None, tool_call_id=None, tool_calls=None, audio=None, images=None, videos=None, audio_output=None, thinking=None, redacted_thinking=None, provider_data=None, reasoning_content=None, tool_name=None, tool_args=None, tool_call_error=None, stop_after_tool_call=False, add_to_agent_memory=True, from_history=False, metrics=MessageMetrics(input_tokens=0, output_tokens=0, total_tokens=0, prompt_tokens=0, completion_tokens=0, 
prompt_tokens_details=None, completion_tokens_details=None, additional_metrics=None, time=None, time_to_first_token=None, timer=None), references=None, created_at=1742166627), Message(role='assistant', content="Certainly! I'll utilize the SeleniumScraperTool to extract product details from the provided Walmart product URL. Please hold on for a moment while I perform the task.", name=None, tool_call_id=None, tool_calls=None, audio=None, images=None, videos=None, audio_output=None, thinking=None, redacted_thinking=None, provider_data=None, reasoning_content=None, tool_name=None, tool_args=None, tool_call_error=None, stop_after_tool_call=False, add_to_agent_memory=True, from_history=False, metrics=MessageMetrics(input_tokens=160, output_tokens=33, total_tokens=193, prompt_tokens=160, completion_tokens=33, prompt_tokens_details={'audio_tokens': 0, 'cached_tokens': 0}, completion_tokens_details={'accepted_prediction_tokens': 0, 'audio_tokens': 0, 'reasoning_tokens': 0, 'rejected_prediction_tokens': 0}, additional_metrics=None, time=1.4171337910011061, time_to_first_token=None, timer=<agno.utils.timer.Timer object at 0x105987050>), references=None, created_at=1742166627)], metrics={'input_tokens': [160], 'output_tokens': [33], 'total_tokens': [193], 'prompt_tokens': [160], 'completion_tokens': [33], 'prompt_tokens_details': [{'audio_tokens': 0, 'cached_tokens': 0}], 'completion_tokens_details': [{'accepted_prediction_tokens': 0, 'audio_tokens': 0, 'reasoning_tokens': 0, 'rejected_prediction_tokens': 0}], 'time': [1.4171337910011061]}, model='gpt-4o', run_id='c13e67e7-a963-4be8-b6cc-c864441a6028', agent_id='3bd8c4f4-aaba-42c0-a3c4-db746783ad25', session_id='0cf65eb8-3150-4fd7-9072-fee8f78db0b2', workflow_id=None, tools=[], images=None, videos=None, audio=None, response_audio=None, extra_data=None, created_at=1742166627)
Could anyone offer advice or point out any bugs they see?