Refactor Playwright scraping with environment-specific browser and Queue-It handling

Branch: mailing
Razmig Sarkissian · 2 months ago
parent a7cbf4c6a6
commit 2c47025a77
1 changed file, 415 lines changed: api/utils.py
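In summary, the diff swaps every hard-coded p.chromium.launch(headless=True) call for a shared get_browser_for_environment(p) helper (added at the bottom of the file), comments out most per-request logging, and gives the umpire scraper a Queue-It wait loop plus a requests-based fallback. A minimal sketch of the call pattern the refactor converges on (scrape_example is hypothetical; the helper and timeout values are the ones in the diff):

    from playwright.sync_api import sync_playwright

    def scrape_example(url):
        with sync_playwright() as p:
            # Firefox in dev (DEBUG=True), Chromium in prod -- see helper at end of diff
            browser = get_browser_for_environment(p)
            try:
                page = browser.new_page()
                page.goto(url, timeout=30000, wait_until="domcontentloaded")
                return page.content()
            finally:
                browser.close()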

@@ -6,6 +6,8 @@ from datetime import datetime, timedelta
import json
import traceback
from concurrent.futures import ThreadPoolExecutor, as_completed
from django.conf import settings
from bs4 import BeautifulSoup
logger = logging.getLogger(__name__)
@@ -32,7 +34,7 @@ def scrape_fft_club_tournaments(
logger.info(f"Starting Playwright scraping for {club_name}")
try:
with sync_playwright() as p:
browser = p.chromium.launch(headless=True)
browser = get_browser_for_environment(p)
page_obj = browser.new_page()
page_obj.set_extra_http_headers(
@@ -43,10 +45,10 @@ def scrape_fft_club_tournaments(
# Navigate to FFT
target_url = "https://tenup.fft.fr/recherche/tournois"
logger.info(f"Navigating to: {target_url}")
# logger.info(f"Navigating to: {target_url}")
page_obj.goto(target_url)
# page_obj.wait_for_timeout(7000)
page_obj.wait_for_timeout(2000)
current_url = page_obj.url
logger.info(f"Current URL: {current_url}")
@@ -64,7 +66,7 @@ def scrape_fft_club_tournaments(
return None
form_build_id = form_input.get_attribute("value")
logger.info(f"Extracted form_build_id: {form_build_id}")
# logger.info(f"Extracted form_build_id: {form_build_id}")
# Build parameters
date_component = ""
@@ -83,7 +85,7 @@ def scrape_fft_club_tournaments(
params = f"recherche_type=club&club[autocomplete][value_container][value_field]={club_code_clean}&club[autocomplete][value_container][label_field]={club_name_encoded}&pratique=PADEL{date_component}&page={page}&sort=dateDebut+asc&form_build_id={form_build_id}&form_id=recherche_tournois_form&_triggering_element_name=submit_page&_triggering_element_value=Submit+page"
logger.info(f"AJAX Parameters: {params}")
# logger.info(f"AJAX Parameters: {params}")
# Make AJAX request and capture the full response
ajax_script = f"""
@@ -118,24 +120,24 @@ def scrape_fft_club_tournaments(
}}
"""
logger.info("Making AJAX request...")
# logger.info("Making AJAX request...")
result = page_obj.evaluate(ajax_script)
browser.close()
# Print the full response for debugging
logger.info(f"AJAX Response Status: {result.get('status')}")
logger.info(f"AJAX Response Success: {result.get('success')}")
# logger.info(f"AJAX Response Status: {result.get('status')}")
# logger.info(f"AJAX Response Success: {result.get('success')}")
if result.get("success"):
response_text = result.get("responseText", "")
logger.info(f"Raw Response Length: {len(response_text)}")
# logger.info(f"Raw Response Length: {len(response_text)}")
# logger.info(f"Raw Response (first 500 chars): {response_text[:500]}")
try:
# Try to parse as JSON
json_data = json.loads(response_text)
logger.info(f"JSON Response Type: {type(json_data)}")
# logger.info(f"JSON Response Type: {type(json_data)}")
# logger.info(f"JSON Response: {json.dumps(json_data, indent=2, default=str)}")
# Now try to parse it
@@ -164,7 +166,7 @@ def scrape_fft_club_tournaments_all_pages(
"""
Scrapes all pages of FFT tournaments for a specific club
"""
logger.info(f"Starting complete tournament scraping for {club_name}")
# logger.info(f"Starting complete tournament scraping for {club_name}")
all_tournaments = []
page = 0
@@ -237,7 +239,7 @@ def _parse_ajax_response(commands):
# Check for alert commands (maintenance mode)
for command in commands:
if command.get("command") == "alert":
logger.warning("Maintenance mode detected")
# logger.warning("Maintenance mode detected")
return None
# Find the command with results
@@ -245,7 +247,7 @@ def _parse_ajax_response(commands):
for command in commands:
if command.get("command") == "recherche_tournois_update":
result_command = command
logger.info("Found recherche_tournois_update command!")
# logger.info("Found recherche_tournois_update command!")
break
if result_command and result_command.get("results"):
@@ -253,7 +255,7 @@ def _parse_ajax_response(commands):
items = results.get("items", [])
total_results = results.get("nb_results", 0)
logger.info(f"Processing {len(items)} tournaments from results")
# logger.info(f"Processing {len(items)} tournaments from results")
for item in items:
# Parse dates - they're already in the correct format
@@ -331,9 +333,9 @@ def _parse_ajax_response(commands):
tournaments.append(tournament)
logger.info(
f"Successfully parsed {len(tournaments)} tournaments from response"
)
# logger.info(
# f"Successfully parsed {len(tournaments)} tournaments from response"
# )
return {
"tournaments": tournaments,
"total_results": total_results,
@@ -371,7 +373,7 @@ def scrape_fft_all_tournaments(
try:
with sync_playwright() as p:
browser = p.chromium.launch(headless=True)
browser = get_browser_for_environment(p)
page_obj = browser.new_page()
page_obj.set_extra_http_headers(
@@ -382,28 +384,28 @@ def scrape_fft_all_tournaments(
# Navigate to FFT
target_url = "https://tenup.fft.fr/recherche/tournois"
logger.info(f"Navigating to: {target_url}")
# logger.info(f"Navigating to: {target_url}")
page_obj.goto(target_url)
# page_obj.wait_for_timeout(7000)
page_obj.wait_for_timeout(500)
current_url = page_obj.url
logger.info(f"Current URL: {current_url}")
# logger.info(f"Current URL: {current_url}")
if "queue-it.net" in current_url.lower():
logger.warning("Still in Queue-It")
# logger.warning("Still in Queue-It")
browser.close()
return None
# Extract form_build_id
form_input = page_obj.query_selector('input[name="form_build_id"]')
if not form_input:
logger.error("Could not find form_build_id")
# logger.error("Could not find form_build_id")
browser.close()
return None
form_build_id = form_input.get_attribute("value")
logger.info(f"Extracted form_build_id: {form_build_id}")
# logger.info(f"Extracted form_build_id: {form_build_id}")
# Build parameters - EXACT same pattern as club function
date_component = ""
@@ -422,32 +424,32 @@ def scrape_fft_all_tournaments(
# Add categories filter
if categories:
logger.info(f"Adding categories filter: {categories}")
# logger.info(f"Adding categories filter: {categories}")
for category in categories:
filter_params += f"&epreuve[{category}]={category}"
# Add levels filter
if levels:
logger.info(f"Adding levels filter: {levels}")
# logger.info(f"Adding levels filter: {levels}")
for level in levels:
filter_params += f"&categorie_tournoi[{level}]={level}"
# Add ages filter
if ages:
logger.info(f"Adding ages filter: {ages}")
# logger.info(f"Adding ages filter: {ages}")
for age in ages:
filter_params += f"&categorie_age[{age}]={age}"
# Add types filter
if tournament_types:
logger.info(f"Adding types filter: {tournament_types}")
# logger.info(f"Adding types filter: {tournament_types}")
for t_type in tournament_types:
capitalized_type = t_type.capitalize()
filter_params += f"&type[{capitalized_type}]={capitalized_type}"
# Add national cup filter
if national_cup:
logger.info("Adding national cup filter")
# logger.info("Adding national cup filter")
filter_params += "&tournoi_npc=1"
# Fix the sorting parameter
@@ -477,7 +479,7 @@ def scrape_fft_all_tournaments(
# Default to ligue search if no city provided
params = f"recherche_type=ligue&pratique=PADEL{date_component}{filter_params}&page={page}&sort={sorting_option or 'dateDebut+asc'}&form_build_id={form_build_id}&form_id=recherche_tournois_form&_triggering_element_name=submit_page&_triggering_element_value=Submit+page"
logger.info(f"AJAX Parameters: {params}")
# logger.info(f"AJAX Parameters: {params}")
# Make AJAX request and capture the full response - EXACT same as club function
ajax_script = f"""
@@ -512,23 +514,23 @@ def scrape_fft_all_tournaments(
}}
"""
logger.info("Making AJAX request...")
# logger.info("Making AJAX request...")
result = page_obj.evaluate(ajax_script)
browser.close()
# Print the full response for debugging - EXACT same as club function
logger.info(f"AJAX Response Status: {result.get('status')}")
logger.info(f"AJAX Response Success: {result.get('success')}")
# logger.info(f"AJAX Response Status: {result.get('status')}")
# logger.info(f"AJAX Response Success: {result.get('success')}")
if result.get("success"):
response_text = result.get("responseText", "")
logger.info(f"Raw Response Length: {len(response_text)}")
# logger.info(f"Raw Response Length: {len(response_text)}")
try:
# Try to parse as JSON
json_data = json.loads(response_text)
logger.info(f"JSON Response Type: {type(json_data)}")
# logger.info(f"JSON Response Type: {type(json_data)}")
# Now try to parse it - EXACT same as club function
parsed_result = _parse_ajax_response(json_data)
@@ -536,11 +538,11 @@ def scrape_fft_all_tournaments(
return parsed_result
except json.JSONDecodeError as json_error:
logger.error(f"JSON Parse Error: {json_error}")
logger.error(f"Response text: {response_text}")
# logger.error(f"JSON Parse Error: {json_error}")
# logger.error(f"Response text: {response_text}")
return None
else:
logger.error(f"AJAX request failed: {result.get('error')}")
# logger.error(f"AJAX request failed: {result.get('error')}")
return None
except Exception as e:
@@ -551,73 +553,277 @@ def scrape_fft_all_tournaments(
def get_umpire_data(tournament_id):
"""
Fast umpire data extraction using Playwright (optimized for speed)
Umpire data extraction with improved Queue-It handling
"""
logger.info(f"Getting umpire data for tournament {tournament_id}")
# logger.info(f"Getting umpire data for tournament {tournament_id}")
try:
with sync_playwright() as p:
browser = p.chromium.launch(
headless=True,
args=[
"--no-sandbox",
"--disable-dev-shm-usage",
"--disable-images", # Don't load images
"--disable-javascript", # Disable JS for faster loading
"--disable-plugins",
"--disable-extensions",
],
)
browser = get_browser_for_environment(p)
page = browser.new_page()
# Navigate to tournament page quickly
# Navigate to tournament page
url = f"https://tenup.fft.fr/tournoi/{tournament_id}"
logger.info(f"Navigating to tournament page: {url}")
# logger.info(f"Navigating to tournament page: {url}")
try:
# Fast navigation - don't wait for everything to load
page.goto(url, timeout=15000, wait_until="domcontentloaded")
# Navigate with reasonable timeout
page.goto(url, timeout=30000, wait_until="domcontentloaded")
# Quick Queue-It check
# Enhanced Queue-It handling (similar to tournament search)
if "queue-it.net" in page.url.lower():
logger.warning("Hit Queue-It on tournament page")
browser.close()
return None, None, None
# Extract data using the fastest method - regex on HTML content
# logger.warning(f"Queue-It detected for tournament {tournament_id}")
# Wait strategy based on environment
max_queue_wait = (
120000 if not settings.DEBUG else 30000
) # 2 min prod, 30s dev
check_interval = 1000  # check every 1 second
elapsed_time = 0
# logger.info(
# f"Waiting in queue for umpire data (max {max_queue_wait / 1000}s)..."
# )
while (
elapsed_time < max_queue_wait
and "queue-it.net" in page.url.lower()
):
# logger.info(
# f"Umpire queue wait: {elapsed_time / 1000}s elapsed"
# )
page.wait_for_timeout(check_interval)
elapsed_time += check_interval
# Check if we've been redirected
if "queue-it.net" not in page.url.lower():
# logger.info(
# "Successfully passed through queue for umpire data!"
# )
break
# If still in queue, try fallback or return None
if "queue-it.net" in page.url.lower():
# logger.warning(
# "Umpire queue timeout - trying requests fallback"
# )
browser.close()
return _get_umpire_data_requests_fallback(tournament_id)
# Wait for page to load properly
page.wait_for_load_state("networkidle", timeout=20000)
# Extract data using multiple strategies
html_content = page.content()
# Extract name
name_pattern = (
r'tournoi-detail-page-inscription-responsable-title">\s*([^<]+)\s*<'
)
name_match = re.search(name_pattern, html_content)
name = name_match.group(1).strip() if name_match else None
# Extract email
email_pattern = r'mailto:([^"]+)"'
email_match = re.search(email_pattern, html_content)
email = email_match.group(1) if email_match else None
# Strategy 1: Try existing regex patterns
name, email, phone = _extract_umpire_with_regex(html_content)
# Extract phone
phone_pattern = r'<div class="details-bloc">\s*(\d{2}\s+\d{2}\s+\d{2}\s+\d{2}\s+\d{2})\s*</div>'
phone_match = re.search(phone_pattern, html_content)
phone = phone_match.group(1).strip() if phone_match else None
# Strategy 2: If regex fails, try DOM selectors
if not name and not email and not phone:
name, email, phone = _extract_umpire_with_selectors(page)
browser.close()
logger.info(
f"Extracted umpire data: name={name}, email={email}, phone={phone}"
)
return name, email, phone
if name or email or phone:
# logger.info(
# f"Successfully extracted umpire data: name={name}, email={email}, phone={phone}"
# )
return name, email, phone
else:
logger.warning(
f"No umpire data found for tournament {tournament_id}"
)
return None, None, None
except Exception as page_error:
logger.error(f"Error loading tournament page: {page_error}")
logger.error(
f"Error loading tournament page {tournament_id}: {page_error}"
)
browser.close()
return None, None, None
# Try requests fallback
logger.info("Trying requests fallback after Playwright error")
return _get_umpire_data_requests_fallback(tournament_id)
except Exception as e:
logger.error(f"Error in umpire data extraction for {tournament_id}: {e}")
return None, None, None
def _extract_umpire_with_regex(html_content):
"""
Extract umpire data using regex patterns
"""
# Extract name
name_pattern = r'tournoi-detail-page-inscription-responsable-title">\s*([^<]+)\s*<'
name_match = re.search(name_pattern, html_content)
name = name_match.group(1).strip() if name_match else None
# Extract email
email_pattern = r'mailto:([^"]+)"'
email_match = re.search(email_pattern, html_content)
email = email_match.group(1) if email_match else None
# Extract phone - try multiple patterns
phone_patterns = [
r'<div class="details-bloc">\s*(\d{2}\s+\d{2}\s+\d{2}\s+\d{2}\s+\d{2})\s*</div>',
r"(\d{2}\.\d{2}\.\d{2}\.\d{2}\.\d{2})",
r"(\d{10})",
r"(\+33\s?\d{1}\s?\d{2}\s?\d{2}\s?\d{2}\s?\d{2})",
]
phone = None
for pattern in phone_patterns:
phone_match = re.search(pattern, html_content)
if phone_match:
phone = phone_match.group(1).strip()
break
return name, email, phone
def _extract_umpire_with_selectors(page):
"""
Extract umpire data using DOM selectors as fallback
"""
name = None
email = None
phone = None
try:
# Try different selectors for name
name_selectors = [
".tournoi-detail-page-inscription-responsable-title",
'[class*="responsable-title"]',
'[class*="umpire-name"]',
'h3:has-text("Responsable")',
]
for selector in name_selectors:
try:
element = page.query_selector(selector)
if element:
name = element.inner_text().strip()
if name:
break
except Exception:
continue
# Try different selectors for email
email_selectors = [
'a[href^="mailto:"]',
'[class*="email"]',
]
for selector in email_selectors:
try:
element = page.query_selector(selector)
if element:
href = element.get_attribute("href")
if href and href.startswith("mailto:"):
email = href.replace("mailto:", "")
break
text = element.inner_text().strip()
if "@" in text:
email = text
break
except Exception:
continue
# Try different selectors for phone
phone_selectors = [
".details-bloc",
'[class*="phone"]',
'[class*="telephone"]',
]
for selector in phone_selectors:
try:
element = page.query_selector(selector)
if element:
text = element.inner_text().strip()
# Check if text looks like a phone number
if re.match(r"[\d\s\.\+\-\(\)]{8,}", text):
phone = text
break
except Exception:
continue
except Exception as e:
logger.warning(f"Error in selector-based extraction: {e}")
return name, email, phone
def _get_umpire_data_requests_fallback(tournament_id):
"""
Enhanced fallback method using requests
"""
logger.info(f"Using requests fallback for tournament {tournament_id}")
try:
headers = {
"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36",
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
"Accept-Language": "fr-FR,fr;q=0.9,en;q=0.8",
"Accept-Encoding": "gzip, deflate, br",
"Connection": "keep-alive",
}
url = f"https://tenup.fft.fr/tournoi/{tournament_id}"
response = requests.get(url, headers=headers, timeout=15)
if "queue-it.net" in response.url:
# logger.warning("Requests fallback also hit Queue-It")
return None, None, None
if response.status_code != 200:
# logger.error(f"Requests fallback failed: {response.status_code}")
return None, None, None
# Try regex extraction on requests response
name, email, phone = _extract_umpire_with_regex(response.text)
# If regex fails, try BeautifulSoup
if not name and not email and not phone:
try:
soup = BeautifulSoup(response.text, "html.parser")
# Try to find name
name_elements = soup.find_all(
class_=lambda x: x and "responsable" in x.lower()
)
if name_elements:
name = name_elements[0].get_text().strip()
# Try to find email
email_links = soup.find_all(
"a", href=lambda x: x and x.startswith("mailto:")
)
if email_links:
email = email_links[0]["href"].replace("mailto:", "")
# Try to find phone in various elements
for element in soup.find_all(string=True):
if re.search(
r"\d{2}[\s\.]\d{2}[\s\.]\d{2}[\s\.]\d{2}[\s\.]\d{2}",
str(element),
):
phone = str(element).strip()
break
except Exception as soup_error:
logger.warning(f"BeautifulSoup parsing failed: {soup_error}")
logger.info(
f"Requests fallback result: name={name}, email={email}, phone={phone}"
)
return name, email, phone
except Exception as e:
logger.error(f"Error in umpire data extraction: {e}")
logger.error(f"Requests fallback error: {e}")
return None, None, None
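Taken together, umpire extraction now degrades through three tiers: Playwright with regex patterns, Playwright with DOM selectors, then plain requests with regex and BeautifulSoup. The regex tier can be sanity-checked in isolation; the fragment below is synthetic, modeled on the markup the patterns target:

    sample_html = (
        '<div class="tournoi-detail-page-inscription-responsable-title"> Jane Doe </div>'
        '<a href="mailto:jane@example.com">contact</a>'
        '<div class="details-bloc"> 06 12 34 56 78 </div>'
    )
    name, email, phone = _extract_umpire_with_regex(sample_html)
    assert (name, email, phone) == ("Jane Doe", "jane@example.com", "06 12 34 56 78")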
@@ -715,7 +921,7 @@ def scrape_fft_all_tournaments_concurrent(
ages=None,
tournament_types=None,
national_cup=False,
max_workers=5,
max_workers=10,
):
"""
Scrapes all remaining pages of FFT tournaments concurrently (pages 1 to end)
@@ -777,8 +983,8 @@ def scrape_fft_all_tournaments_concurrent(
# Scrape all remaining pages concurrently (pages 1 to total_pages-1)
all_tournaments = []
with ThreadPoolExecutor(max_workers=max_workers) as executor:
max_concurrent = min(total_pages, 20)
with ThreadPoolExecutor(max_workers=max_concurrent) as executor:
futures = []
for page in range(1, total_pages):
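The executor is now sized to the workload, min(total_pages, 20), which supersedes the max_workers argument (raised to 10 above but effectively unused). The bounding idiom in isolation, as a sketch where fetch_page stands in for the per-page scrape call:

    from concurrent.futures import ThreadPoolExecutor, as_completed

    def scrape_remaining_pages(total_pages, fetch_page):
        # Never spawn more workers than there are pages, and cap the fan-out at 20;
        # max(1, ...) guards the zero-page case the diff leaves implicit.
        max_concurrent = max(1, min(total_pages, 20))
        tournaments = []
        with ThreadPoolExecutor(max_workers=max_concurrent) as executor:
            futures = [executor.submit(fetch_page, page) for page in range(1, total_pages)]
            for future in as_completed(futures):
                tournaments.extend(future.result() or [])
        return tournaments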
@@ -834,7 +1040,7 @@ def _parse_clubs_ajax_response(json_data):
"""
try:
# Log the raw response structure to understand what we're getting
logger.info(f"Raw clubs response structure: {json_data}")
# logger.info(f"Raw clubs response structure: {json_data}")
club_markers = []
total_results = 0
@@ -937,7 +1143,7 @@ def scrape_federal_clubs(
try:
with sync_playwright() as p:
browser = p.chromium.launch(headless=True)
browser = get_browser_for_environment(p)
page_obj = browser.new_page()
page_obj.set_extra_http_headers(
@@ -949,8 +1155,6 @@ def scrape_federal_clubs(
# Clean up city name - remove zip code and extra info
clean_city = city
if city:
import re
clean_city = re.sub(r"[,\s]*\d{5}.*$", "", city).strip()
clean_city = clean_city.rstrip(",").strip()
@@ -966,7 +1170,7 @@ def scrape_federal_clubs(
page_obj.goto(results_url)
# Wait for the page to load
page_obj.wait_for_timeout(3000)
page_obj.wait_for_timeout(2000)
# Check if we're in queue
if "queue-it.net" in page_obj.url.lower():
@@ -1056,3 +1260,32 @@ def scrape_federal_clubs(
logger.error(f"Error in federal clubs scraping: {e}")
logger.error(f"Traceback: {traceback.format_exc()}")
return {"typeRecherche": "clubs", "nombreResultat": 0, "club_markers": []}
def get_browser_for_environment(playwright_instance):
"""
Get appropriate browser based on environment
- Development (DEBUG=True): Use Firefox (works better on macOS 15+)
- Production (DEBUG=False): Use Chromium (as it was working)
"""
is_development = getattr(settings, "DEBUG", False)
if is_development:
# logger.info("Development environment detected - using Firefox")
try:
return playwright_instance.firefox.launch(
headless=True, args=["--no-sandbox"]
)
except Exception as firefox_error:
logger.warning(
f"Firefox failed in dev, falling back to Chromium: {firefox_error}"
)
return playwright_instance.chromium.launch(
headless=True,
args=["--no-sandbox", "--disable-dev-shm-usage", "--single-process"],
)
else:
logger.info("Production environment detected - using Chromium")
return playwright_instance.chromium.launch(
headless=True, args=["--no-sandbox", "--disable-dev-shm-usage"]
)
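The Queue-It handling added to get_umpire_data reduces to a poll-until-redirected loop. Extracted as a standalone sketch (wait_out_queue is hypothetical; page is a Playwright page, and the timings mirror the values used above):

    def wait_out_queue(page, max_wait_ms=120000, check_interval_ms=1000):
        """Poll until the browser leaves queue-it.net or the time budget runs out."""
        elapsed = 0
        while elapsed < max_wait_ms and "queue-it.net" in page.url.lower():
            page.wait_for_timeout(check_interval_ms)  # Playwright's built-in sleep
            elapsed += check_interval_ms
        # True once the queue has redirected us to the real page
        return "queue-it.net" not in page.url.lower()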
