I am trying to scrape Walmart’s product page. I have the following script running:
import csv
import time

from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.firefox.service import Service
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
def setup_driver():
    """Create and return a Firefox WebDriver configured for scraping."""
    firefox_options = Options()
    firefox_options.add_argument("--start-maximized")
    # Enable the next line to run without a visible browser window.
    # firefox_options.add_argument("--headless")
    return webdriver.Firefox(options=firefox_options)
def solve_captcha(driver, hold_seconds=12.5):
    """Attempt to solve a press-and-hold (PerimeterX) CAPTCHA.

    Waits up to 10 seconds for the ``#px-captcha`` element, then clicks
    and holds it for ``hold_seconds`` seconds, releases, and clicks once
    more before giving the page time to reload.

    Args:
        driver: Active Selenium WebDriver instance.
        hold_seconds: How long to keep the CAPTCHA button pressed.
            Defaults to 12.5, matching the original hard-coded hold.

    Returns:
        True if a CAPTCHA was found and the attempt completed,
        False if no CAPTCHA appeared or an error occurred.
    """
    try:
        # Wait for the captcha widget to become visible; a timeout here
        # simply means the page loaded without a challenge.
        captcha_element = WebDriverWait(driver, 10).until(
            EC.visibility_of_element_located((By.CSS_SELECTOR, '#px-captcha'))
        )
        print("CAPTCHA detected! Attempting to solve...")

        # Press and hold the widget for the requested duration.
        action = ActionChains(driver)
        action.click_and_hold(captcha_element)
        action.perform()
        time.sleep(hold_seconds)

        # Release, pause briefly, then click once more.
        action.release(captcha_element)
        action.perform()
        time.sleep(0.2)
        action.click(captcha_element)
        action.perform()

        # Give the page time to load after the challenge.
        time.sleep(5)
        print("CAPTCHA solution attempt completed")
        return True
    except TimeoutException:
        # Normal path: no captcha appeared within the wait window.
        print("No standard CAPTCHA detected or error: timed out waiting for #px-captcha")
        return False
    except Exception as e:
        # Unexpected failure while interacting with the captcha element.
        print(f"No standard CAPTCHA detected or error: {str(e)}")
        return False
def _find_first_text(driver, selectors, label):
    """Return the stripped text of the first matching CSS selector.

    Args:
        driver: Active Selenium WebDriver instance.
        selectors: Iterable of CSS selector strings, tried in order.
        label: Human-readable field name, used only for logging.

    Returns:
        The element's stripped text, or 'N/A' if no selector matched.
    """
    for selector in selectors:
        try:
            element = driver.find_element(By.CSS_SELECTOR, selector)
            print(f"Found {label} using selector: {selector}")
            return element.text.strip()
        except Exception:
            # This selector does not exist on this page layout; try the next.
            continue
    return 'N/A'


def scrape_product_details(url, driver):
    """Scrape name, price, and description from a product page.

    Loads *url*, attempts automatic CAPTCHA solving, falls back to asking
    the user to solve it manually, then tries several CSS selectors per
    field because the site serves multiple page layouts.

    Args:
        url: Product page URL to scrape.
        driver: Active Selenium WebDriver instance.

    Returns:
        dict with keys 'url', 'name', 'price', 'about'; fields that could
        not be located are the string 'N/A'.
    """
    driver.get(url)

    # Try to clear any press-and-hold CAPTCHA automatically first.
    solve_captcha(driver)

    # Wait for the product title; if it never appears we may still be
    # stuck behind a CAPTCHA, in which case ask the user to intervene.
    try:
        WebDriverWait(driver, 15).until(
            EC.presence_of_element_located((By.ID, "main-title"))
        )
    except Exception:
        print("Product title not found. Checking if we need to solve CAPTCHA manually...")
        try:
            if driver.find_element(By.CSS_SELECTOR, '#px-captcha'):
                print("CAPTCHA is still present. Please solve it manually.")
                input("Press Enter after solving the CAPTCHA...")
        except Exception:
            print("Could not find CAPTCHA element or product title. Page might not be loading correctly.")

    # Each list covers the known page variants, in order of preference.
    name_selectors = [
        "#main-title",                              # id used in current layout
        "[itemprop='name']",                        # schema.org markup
        "h1[data-fs-element='name']",               # data-fs-element attribute
        "h1.lh-copy.dark-gray.mv1.f4.mh0.b",        # class combination
        "h1[data-seo-id='hero-carousel-image']",    # data-seo-id attribute
    ]
    price_selectors = [
        "[data-testid='price-value']",
        "[itemprop='price']",
        ".w_Gd.w_Gh",
        ".b.black.f1.ma0",                          # common price class combinations
    ]
    about_selectors = [
        "#about-item-section",
        "[data-testid='product-description']",
        "[itemprop='description']",
        ".prod-ProductDetails-description-content",
    ]

    return {
        'url': url,
        'name': _find_first_text(driver, name_selectors, "product name"),
        'price': _find_first_text(driver, price_selectors, "price"),
        'about': _find_first_text(driver, about_selectors, "about section"),
    }
def main():
    """Entry point: scrape one hard-coded product URL and write the CSV."""
    # Hardcoded URL - replace with your target product URL
    url = "https://www.walmart.com/ip/Great-Value-Grade-A-Large-White-Eggs-18-count/145051970"

    driver = setup_driver()
    try:
        print(f"Scraping product details from {url}...")
        product = scrape_product_details(url, driver)
        _print_summary(product)
        _save_to_csv(product)
    finally:
        # Always shut the browser down, even if scraping failed.
        driver.quit()


def _print_summary(product):
    """Print a short human-readable summary of the scraped record."""
    print("\nProduct Details:")
    print(f"Name: {product['name']}")
    print(f"Price: {product['price']}")
    print(f"About: {product['about'][:100]}...")  # first 100 chars only


def _save_to_csv(product):
    """Write the single scraped record to product_details.csv."""
    with open('product_details.csv', 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=product.keys())
        writer.writeheader()
        writer.writerow(product)
    print("\nSaved to product_details.csv")


if __name__ == "__main__":
    main()
But the results are never written to my CSV, and I’m not even sure the request goes through. All I know is that my terminal shows the following:
RunResponse(content="Certainly! I'll utilize the SeleniumScraperTool to extract product details from the provided Walmart product URL. Please hold on for a moment while I perform the task.", content_type='str', thinking=None, event='RunResponse', messages=[Message(role='system', content="You are ProductScraper, an agent that extracts product details (name, price, and description) from e-commerce webpages using Selenium with automatic CAPTCHA handling.\n\n<instructions>\nWhen provided with a product URL, use your SeleniumScraperTool to scrape and return the product details. Also save the results to a CSV file named 'product_details.csv'.\n</instructions>\n\n<additional_information>\n- Use markdown to format your answers.\n</additional_information>", name=None, tool_call_id=None, tool_calls=None, audio=None, images=None, videos=None, audio_output=None, thinking=None, redacted_thinking=None, provider_data=None, reasoning_content=None, tool_name=None, tool_args=None, tool_call_error=None, stop_after_tool_call=False, add_to_agent_memory=True, from_history=False, metrics=MessageMetrics(input_tokens=0, output_tokens=0, total_tokens=0, prompt_tokens=0, completion_tokens=0, prompt_tokens_details=None, completion_tokens_details=None, additional_metrics=None, time=None, time_to_first_token=None, timer=None), references=None, created_at=1742166627), Message(role='user', content='https://www.walmart.com/ip/Nike-Unisex-Everyday-Cotton-Cushioned-Crew-Training-Socks-with-DRI-FIT-Technology-White-6-Pairs/439915194?classType=VARIANT&athbdg=L1600&from=/search', name=None, tool_call_id=None, tool_calls=None, audio=None, images=None, videos=None, audio_output=None, thinking=None, redacted_thinking=None, provider_data=None, reasoning_content=None, tool_name=None, tool_args=None, tool_call_error=None, stop_after_tool_call=False, add_to_agent_memory=True, from_history=False, metrics=MessageMetrics(input_tokens=0, output_tokens=0, total_tokens=0, prompt_tokens=0, completion_tokens=0, 
prompt_tokens_details=None, completion_tokens_details=None, additional_metrics=None, time=None, time_to_first_token=None, timer=None), references=None, created_at=1742166627), Message(role='assistant', content="Certainly! I'll utilize the SeleniumScraperTool to extract product details from the provided Walmart product URL. Please hold on for a moment while I perform the task.", name=None, tool_call_id=None, tool_calls=None, audio=None, images=None, videos=None, audio_output=None, thinking=None, redacted_thinking=None, provider_data=None, reasoning_content=None, tool_name=None, tool_args=None, tool_call_error=None, stop_after_tool_call=False, add_to_agent_memory=True, from_history=False, metrics=MessageMetrics(input_tokens=160, output_tokens=33, total_tokens=193, prompt_tokens=160, completion_tokens=33, prompt_tokens_details={'audio_tokens': 0, 'cached_tokens': 0}, completion_tokens_details={'accepted_prediction_tokens': 0, 'audio_tokens': 0, 'reasoning_tokens': 0, 'rejected_prediction_tokens': 0}, additional_metrics=None, time=1.4171337910011061, time_to_first_token=None, timer=<agno.utils.timer.Timer object at 0x105987050>), references=None, created_at=1742166627)], metrics={'input_tokens': [160], 'output_tokens': [33], 'total_tokens': [193], 'prompt_tokens': [160], 'completion_tokens': [33], 'prompt_tokens_details': [{'audio_tokens': 0, 'cached_tokens': 0}], 'completion_tokens_details': [{'accepted_prediction_tokens': 0, 'audio_tokens': 0, 'reasoning_tokens': 0, 'rejected_prediction_tokens': 0}], 'time': [1.4171337910011061]}, model='gpt-4o', run_id='c13e67e7-a963-4be8-b6cc-c864441a6028', agent_id='3bd8c4f4-aaba-42c0-a3c4-db746783ad25', session_id='0cf65eb8-3150-4fd7-9072-fee8f78db0b2', workflow_id=None, tools=[], images=None, videos=None, audio=None, response_audio=None, extra_data=None, created_at=1742166627)
Could anyone offer advice or point out any bugs they see?