From 2c47025a771701b1422170ed927a3621a7e3db2e Mon Sep 17 00:00:00 2001
From: Razmig Sarkissian
Date: Thu, 25 Sep 2025 18:52:06 +0200
Subject: [PATCH] Refactor Playwright scraping with environment-specific
 browser and Queue-It handling

---
 api/utils.py | 415 ++++++++++++++++++++++++++++++++++++++++-----------
 1 file changed, 324 insertions(+), 91 deletions(-)

diff --git a/api/utils.py b/api/utils.py
index 50ae798..a79db4e 100644
--- a/api/utils.py
+++ b/api/utils.py
@@ -6,6 +6,8 @@ from datetime import datetime, timedelta
 import json
 import traceback
 from concurrent.futures import ThreadPoolExecutor, as_completed
+from django.conf import settings
+from bs4 import BeautifulSoup
 
 logger = logging.getLogger(__name__)
 
@@ -32,7 +34,7 @@ def scrape_fft_club_tournaments(
     logger.info(f"Starting Playwright scraping for {club_name}")
     try:
         with sync_playwright() as p:
-            browser = p.chromium.launch(headless=True)
+            browser = get_browser_for_environment(p)
             page_obj = browser.new_page()
 
             page_obj.set_extra_http_headers(
@@ -43,10 +45,10 @@ def scrape_fft_club_tournaments(
 
             # Navigate to FFT
             target_url = "https://tenup.fft.fr/recherche/tournois"
-            logger.info(f"Navigating to: {target_url}")
+            # logger.info(f"Navigating to: {target_url}")
             page_obj.goto(target_url)
 
-            # page_obj.wait_for_timeout(7000)
+            page_obj.wait_for_timeout(2000)
 
             current_url = page_obj.url
             logger.info(f"Current URL: {current_url}")
@@ -64,7 +66,7 @@ def scrape_fft_club_tournaments(
                 return None
 
             form_build_id = form_input.get_attribute("value")
-            logger.info(f"Extracted form_build_id: {form_build_id}")
+            # logger.info(f"Extracted form_build_id: {form_build_id}")
 
             # Build parameters
             date_component = ""
@@ -83,7 +85,7 @@ def scrape_fft_club_tournaments(
 
             params = f"recherche_type=club&club[autocomplete][value_container][value_field]={club_code_clean}&club[autocomplete][value_container][label_field]={club_name_encoded}&pratique=PADEL{date_component}&page={page}&sort=dateDebut+asc&form_build_id={form_build_id}&form_id=recherche_tournois_form&_triggering_element_name=submit_page&_triggering_element_value=Submit+page"
 
-            logger.info(f"AJAX Parameters: {params}")
+            # logger.info(f"AJAX Parameters: {params}")
 
             # Make AJAX request and capture the full response
             ajax_script = f"""
@@ -118,24 +120,24 @@ def scrape_fft_club_tournaments(
                 }}
             """
 
-            logger.info("Making AJAX request...")
+            # logger.info("Making AJAX request...")
             result = page_obj.evaluate(ajax_script)
 
             browser.close()
 
             # Print the full response for debugging
-            logger.info(f"AJAX Response Status: {result.get('status')}")
-            logger.info(f"AJAX Response Success: {result.get('success')}")
+            # logger.info(f"AJAX Response Status: {result.get('status')}")
+            # logger.info(f"AJAX Response Success: {result.get('success')}")
 
             if result.get("success"):
                 response_text = result.get("responseText", "")
-                logger.info(f"Raw Response Length: {len(response_text)}")
+                # logger.info(f"Raw Response Length: {len(response_text)}")
                 # logger.info(f"Raw Response (first 500 chars): {response_text[:500]}")
 
                 try:
                     # Try to parse as JSON
                     json_data = json.loads(response_text)
-                    logger.info(f"JSON Response Type: {type(json_data)}")
+                    # logger.info(f"JSON Response Type: {type(json_data)}")
                     # logger.info(f"JSON Response: {json.dumps(json_data, indent=2, default=str)}")
 
                     # Now try to parse it
@@ -164,7 +166,7 @@ def scrape_fft_club_tournaments_all_pages(
     """
     Scrapes all pages of FFT tournaments for a specific club
     """
-    logger.info(f"Starting complete tournament scraping for {club_name}")
+    # logger.info(f"Starting complete tournament scraping for {club_name}")
 
     all_tournaments = []
     page = 0
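
For orientation, the club search above follows a fixed sequence: load the
search page, lift Drupal's form_build_id token out of the rendered form, then
replay the search as an AJAX POST that echoes the token back. A minimal
sketch of the token step (the URL and field name come from the diff; the
function name and everything else are illustrative only):

    from playwright.sync_api import sync_playwright

    def fetch_form_build_id():
        # Load the FFT search page and read Drupal's per-session form token,
        # which the AJAX request built in the patch must include.
        with sync_playwright() as p:
            browser = p.chromium.launch(headless=True)
            page = browser.new_page()
            page.goto("https://tenup.fft.fr/recherche/tournois")
            field = page.query_selector('input[name="form_build_id"]')
            token = field.get_attribute("value") if field else None
            browser.close()
            return token
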
@@ -237,7 +239,7 @@ def _parse_ajax_response(commands):
     # Check for alert commands (maintenance mode)
     for command in commands:
         if command.get("command") == "alert":
-            logger.warning("Maintenance mode detected")
+            # logger.warning("Maintenance mode detected")
             return None
 
     # Find the command with results
@@ -245,7 +247,7 @@ def _parse_ajax_response(commands):
     for command in commands:
         if command.get("command") == "recherche_tournois_update":
             result_command = command
-            logger.info("Found recherche_tournois_update command!")
+            # logger.info("Found recherche_tournois_update command!")
             break
 
     if result_command and result_command.get("results"):
@@ -253,7 +255,7 @@ def _parse_ajax_response(commands):
         items = results.get("items", [])
         total_results = results.get("nb_results", 0)
 
-        logger.info(f"Processing {len(items)} tournaments from results")
+        # logger.info(f"Processing {len(items)} tournaments from results")
 
         for item in items:
             # Parse dates - they're already in the correct format
@@ -331,9 +333,9 @@ def _parse_ajax_response(commands):
 
             tournaments.append(tournament)
 
-        logger.info(
-            f"Successfully parsed {len(tournaments)} tournaments from response"
-        )
+        # logger.info(
+        #     f"Successfully parsed {len(tournaments)} tournaments from response"
+        # )
         return {
             "tournaments": tournaments,
             "total_results": total_results,
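
The parser above walks a Drupal AJAX response, which arrives as a JSON array
of command objects. A compressed sketch of that dispatch (the command names
are taken from the hunks above; the payload shape inside "results" is
abbreviated):

    import json

    def find_results(commands):
        # An "alert" command signals FFT maintenance mode: nothing to parse.
        for command in commands:
            if command.get("command") == "alert":
                return None
        # Otherwise the tournament payload rides on this custom command.
        for command in commands:
            if command.get("command") == "recherche_tournois_update":
                return command.get("results")
        return None

    # usage: results = find_results(json.loads(response_text))
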
@@ -371,7 +373,7 @@ def scrape_fft_all_tournaments(
 
     try:
         with sync_playwright() as p:
-            browser = p.chromium.launch(headless=True)
+            browser = get_browser_for_environment(p)
             page_obj = browser.new_page()
 
             page_obj.set_extra_http_headers(
@@ -382,28 +384,28 @@ def scrape_fft_all_tournaments(
 
             # Navigate to FFT
             target_url = "https://tenup.fft.fr/recherche/tournois"
-            logger.info(f"Navigating to: {target_url}")
+            # logger.info(f"Navigating to: {target_url}")
             page_obj.goto(target_url)
 
-            # page_obj.wait_for_timeout(7000)
+            page_obj.wait_for_timeout(500)
 
             current_url = page_obj.url
-            logger.info(f"Current URL: {current_url}")
+            # logger.info(f"Current URL: {current_url}")
 
             if "queue-it.net" in current_url.lower():
-                logger.warning("Still in Queue-It")
+                # logger.warning("Still in Queue-It")
                 browser.close()
                 return None
 
             # Extract form_build_id
             form_input = page_obj.query_selector('input[name="form_build_id"]')
             if not form_input:
-                logger.error("Could not find form_build_id")
+                # logger.error("Could not find form_build_id")
                 browser.close()
                 return None
 
             form_build_id = form_input.get_attribute("value")
-            logger.info(f"Extracted form_build_id: {form_build_id}")
+            # logger.info(f"Extracted form_build_id: {form_build_id}")
 
             # Build parameters - EXACT same pattern as club function
             date_component = ""
@@ -422,32 +424,32 @@ def scrape_fft_all_tournaments(
 
             # Add categories filter
             if categories:
-                logger.info(f"Adding categories filter: {categories}")
+                # logger.info(f"Adding categories filter: {categories}")
                 for category in categories:
                     filter_params += f"&epreuve[{category}]={category}"
 
             # Add levels filter
             if levels:
-                logger.info(f"Adding levels filter: {levels}")
+                # logger.info(f"Adding levels filter: {levels}")
                 for level in levels:
                     filter_params += f"&categorie_tournoi[{level}]={level}"
 
             # Add ages filter
             if ages:
-                logger.info(f"Adding ages filter: {ages}")
+                # logger.info(f"Adding ages filter: {ages}")
                 for age in ages:
                     filter_params += f"&categorie_age[{age}]={age}"
 
             # Add types filter
             if tournament_types:
-                logger.info(f"Adding types filter: {tournament_types}")
+                # logger.info(f"Adding types filter: {tournament_types}")
                 for t_type in tournament_types:
                     capitalized_type = t_type.capitalize()
                     filter_params += f"&type[{capitalized_type}]={capitalized_type}"
 
             # Add national cup filter
             if national_cup:
-                logger.info("Adding national cup filter")
+                # logger.info("Adding national cup filter")
                 filter_params += "&tournoi_npc=1"
 
             # Fix the sorting parameter
@@ -477,7 +479,7 @@ def scrape_fft_all_tournaments(
                 # Default to ligue search if no city provided
                 params = f"recherche_type=ligue&pratique=PADEL{date_component}{filter_params}&page={page}&sort={sorting_option or 'dateDebut+asc'}&form_build_id={form_build_id}&form_id=recherche_tournois_form&_triggering_element_name=submit_page&_triggering_element_value=Submit+page"
 
-            logger.info(f"AJAX Parameters: {params}")
+            # logger.info(f"AJAX Parameters: {params}")
 
             # Make AJAX request and capture the full response - EXACT same as club function
             ajax_script = f"""
@@ -512,23 +514,23 @@ def scrape_fft_all_tournaments(
                 }}
             """
 
-            logger.info("Making AJAX request...")
+            # logger.info("Making AJAX request...")
             result = page_obj.evaluate(ajax_script)
 
             browser.close()
 
             # Print the full response for debugging - EXACT same as club function
-            logger.info(f"AJAX Response Status: {result.get('status')}")
-            logger.info(f"AJAX Response Success: {result.get('success')}")
+            # logger.info(f"AJAX Response Status: {result.get('status')}")
+            # logger.info(f"AJAX Response Success: {result.get('success')}")
 
             if result.get("success"):
                 response_text = result.get("responseText", "")
-                logger.info(f"Raw Response Length: {len(response_text)}")
+                # logger.info(f"Raw Response Length: {len(response_text)}")
 
                 try:
                     # Try to parse as JSON
                     json_data = json.loads(response_text)
-                    logger.info(f"JSON Response Type: {type(json_data)}")
+                    # logger.info(f"JSON Response Type: {type(json_data)}")
 
                     # Now try to parse it - EXACT same as club function
                     parsed_result = _parse_ajax_response(json_data)
@@ -536,11 +538,11 @@ def scrape_fft_all_tournaments(
                     return parsed_result
 
                 except json.JSONDecodeError as json_error:
-                    logger.error(f"JSON Parse Error: {json_error}")
-                    logger.error(f"Response text: {response_text}")
+                    # logger.error(f"JSON Parse Error: {json_error}")
+                    # logger.error(f"Response text: {response_text}")
                     return None
             else:
-                logger.error(f"AJAX request failed: {result.get('error')}")
+                # logger.error(f"AJAX request failed: {result.get('error')}")
                 return None
 
     except Exception as e:
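
The filter fragments above compose into a single query string keyed the way
the Drupal form expects (field[value]=value). A tiny sketch with made-up
values (the parameter names are the ones in the diff; the helper itself is
illustrative):

    def build_filter_params(categories=None, levels=None, ages=None):
        # Mirrors the filter loops in scrape_fft_all_tournaments above.
        parts = []
        for category in categories or []:
            parts.append(f"&epreuve[{category}]={category}")
        for level in levels or []:
            parts.append(f"&categorie_tournoi[{level}]={level}")
        for age in ages or []:
            parts.append(f"&categorie_age[{age}]={age}")
        return "".join(parts)

    # build_filter_params(categories=["SM"], levels=["P100"])
    # -> "&epreuve[SM]=SM&categorie_tournoi[P100]=P100"
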
wait_until="domcontentloaded") - # Quick Queue-It check + # Enhanced Queue-It handling (similar to tournament search) if "queue-it.net" in page.url.lower(): - logger.warning("Hit Queue-It on tournament page") - browser.close() - return None, None, None - - # Extract data using the fastest method - regex on HTML content + # logger.warning(f"Queue-It detected for tournament {tournament_id}") + + # Wait strategy based on environment + max_queue_wait = ( + 120000 if not settings.DEBUG else 30000 + ) # 2 min prod, 30s dev + check_interval = 1000 # 10 seconds + elapsed_time = 0 + + # logger.info( + # f"Waiting in queue for umpire data (max {max_queue_wait / 1000}s)..." + # ) + + while ( + elapsed_time < max_queue_wait + and "queue-it.net" in page.url.lower() + ): + # logger.info( + # f"Umpire queue wait: {elapsed_time / 1000}s elapsed" + # ) + page.wait_for_timeout(check_interval) + elapsed_time += check_interval + + # Check if we've been redirected + if "queue-it.net" not in page.url.lower(): + # logger.info( + # "Successfully passed through queue for umpire data!" + # ) + break + + # If still in queue, try fallback or return None + if "queue-it.net" in page.url.lower(): + # logger.warning( + # "Umpire queue timeout - trying requests fallback" + # ) + browser.close() + return _get_umpire_data_requests_fallback(tournament_id) + + # Wait for page to load properly + page.wait_for_load_state("networkidle", timeout=20000) + + # Extract data using multiple strategies html_content = page.content() - # Extract name - name_pattern = ( - r'tournoi-detail-page-inscription-responsable-title">\s*([^<]+)\s*<' - ) - name_match = re.search(name_pattern, html_content) - name = name_match.group(1).strip() if name_match else None - - # Extract email - email_pattern = r'mailto:([^"]+)"' - email_match = re.search(email_pattern, html_content) - email = email_match.group(1) if email_match else None + # Strategy 1: Try existing regex patterns + name, email, phone = _extract_umpire_with_regex(html_content) - # Extract phone - phone_pattern = r'
\s*(\d{2}\s+\d{2}\s+\d{2}\s+\d{2}\s+\d{2})\s*
' - phone_match = re.search(phone_pattern, html_content) - phone = phone_match.group(1).strip() if phone_match else None + # Strategy 2: If regex fails, try DOM selectors + if not name and not email and not phone: + name, email, phone = _extract_umpire_with_selectors(page) browser.close() - logger.info( - f"Extracted umpire data: name={name}, email={email}, phone={phone}" - ) - return name, email, phone + if name or email or phone: + # logger.info( + # f"Successfully extracted umpire data: name={name}, email={email}, phone={phone}" + # ) + return name, email, phone + else: + logger.warning( + f"No umpire data found for tournament {tournament_id}" + ) + return None, None, None except Exception as page_error: - logger.error(f"Error loading tournament page: {page_error}") + logger.error( + f"Error loading tournament page {tournament_id}: {page_error}" + ) browser.close() - return None, None, None + + # Try requests fallback + logger.info("Trying requests fallback after Playwright error") + return _get_umpire_data_requests_fallback(tournament_id) + + except Exception as e: + logger.error(f"Error in umpire data extraction for {tournament_id}: {e}") + return None, None, None + + +def _extract_umpire_with_regex(html_content): + """ + Extract umpire data using regex patterns + """ + # Extract name + name_pattern = r'tournoi-detail-page-inscription-responsable-title">\s*([^<]+)\s*<' + name_match = re.search(name_pattern, html_content) + name = name_match.group(1).strip() if name_match else None + + # Extract email + email_pattern = r'mailto:([^"]+)"' + email_match = re.search(email_pattern, html_content) + email = email_match.group(1) if email_match else None + + # Extract phone - try multiple patterns + phone_patterns = [ + r'
\s*(\d{2}\s+\d{2}\s+\d{2}\s+\d{2}\s+\d{2})\s*
', + r"(\d{2}\.\d{2}\.\d{2}\.\d{2}\.\d{2})", + r"(\d{10})", + r"(\+33\s?\d{1}\s?\d{2}\s?\d{2}\s?\d{2}\s?\d{2})", + ] + + phone = None + for pattern in phone_patterns: + phone_match = re.search(pattern, html_content) + if phone_match: + phone = phone_match.group(1).strip() + break + + return name, email, phone + + +def _extract_umpire_with_selectors(page): + """ + Extract umpire data using DOM selectors as fallback + """ + name = None + email = None + phone = None + + try: + # Try different selectors for name + name_selectors = [ + ".tournoi-detail-page-inscription-responsable-title", + '[class*="responsable-title"]', + '[class*="umpire-name"]', + 'h3:has-text("Responsable")', + ] + + for selector in name_selectors: + try: + element = page.query_selector(selector) + if element: + name = element.inner_text().strip() + if name: + break + except: + continue + + # Try different selectors for email + email_selectors = [ + 'a[href^="mailto:"]', + '[class*="email"]', + ] + + for selector in email_selectors: + try: + element = page.query_selector(selector) + if element: + href = element.get_attribute("href") + if href and href.startswith("mailto:"): + email = href.replace("mailto:", "") + break + text = element.inner_text().strip() + if "@" in text: + email = text + break + except: + continue + + # Try different selectors for phone + phone_selectors = [ + ".details-bloc", + '[class*="phone"]', + '[class*="telephone"]', + ] + + for selector in phone_selectors: + try: + element = page.query_selector(selector) + if element: + text = element.inner_text().strip() + # Check if text looks like a phone number + + if re.match(r"[\d\s\.\+\-\(\)]{8,}", text): + phone = text + break + except: + continue + + except Exception as e: + logger.warning(f"Error in selector-based extraction: {e}") + + return name, email, phone + + +def _get_umpire_data_requests_fallback(tournament_id): + """ + Enhanced fallback method using requests + """ + logger.info(f"Using requests fallback for tournament {tournament_id}") + + try: + headers = { + "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36", + "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8", + "Accept-Language": "fr-FR,fr;q=0.9,en;q=0.8", + "Accept-Encoding": "gzip, deflate, br", + "Connection": "keep-alive", + } + + url = f"https://tenup.fft.fr/tournoi/{tournament_id}" + response = requests.get(url, headers=headers, timeout=15) + + if "queue-it.net" in response.url: + # logger.warning("Requests fallback also hit Queue-It") + return None, None, None + + if response.status_code != 200: + # logger.error(f"Requests fallback failed: {response.status_code}") + return None, None, None + + # Try regex extraction on requests response + name, email, phone = _extract_umpire_with_regex(response.text) + + # If regex fails, try BeautifulSoup + if not name and not email and not phone: + try: + soup = BeautifulSoup(response.text, "html.parser") + + # Try to find name + name_elements = soup.find_all( + class_=lambda x: x and "responsable" in x.lower() + ) + if name_elements: + name = name_elements[0].get_text().strip() + + # Try to find email + email_links = soup.find_all( + "a", href=lambda x: x and x.startswith("mailto:") + ) + if email_links: + email = email_links[0]["href"].replace("mailto:", "") + + # Try to find phone in various elements + for element in soup.find_all(text=True): + if re.search( + r"\d{2}[\s\.]\d{2}[\s\.]\d{2}[\s\.]\d{2}[\s\.]\d{2}", + str(element), + ): + 
@@ -715,7 +921,7 @@ def scrape_fft_all_tournaments_concurrent(
     ages=None,
     tournament_types=None,
     national_cup=False,
-    max_workers=5,
+    max_workers=10,
 ):
     """
     Scrapes all remaining pages of FFT tournaments concurrently (pages 1 to end)
@@ -777,8 +983,8 @@ def scrape_fft_all_tournaments_concurrent(
 
     # Scrape all remaining pages concurrently (pages 1 to total_pages-1)
     all_tournaments = []
-
-    with ThreadPoolExecutor(max_workers=max_workers) as executor:
+    max_concurrent = min(total_pages, max_workers)
+    with ThreadPoolExecutor(max_workers=max_concurrent) as executor:
         futures = []
 
         for page in range(1, total_pages):
@@ -834,7 +1040,7 @@ def _parse_clubs_ajax_response(json_data):
     """
     try:
         # Log the raw response structure to understand what we're getting
-        logger.info(f"Raw clubs response structure: {json_data}")
+        # logger.info(f"Raw clubs response structure: {json_data}")
 
         club_markers = []
         total_results = 0
@@ -937,7 +1143,7 @@ def scrape_federal_clubs(
 
     try:
         with sync_playwright() as p:
-            browser = p.chromium.launch(headless=True)
+            browser = get_browser_for_environment(p)
             page_obj = browser.new_page()
 
             page_obj.set_extra_http_headers(
@@ -949,8 +1155,6 @@ def scrape_federal_clubs(
             # Clean up city name - remove zip code and extra info
             clean_city = city
             if city:
-                import re
-
                 clean_city = re.sub(r"[,\s]*\d{5}.*$", "", city).strip()
                 clean_city = clean_city.rstrip(",").strip()
 
@@ -966,7 +1170,7 @@ def scrape_federal_clubs(
             page_obj.goto(results_url)
 
             # Wait for the page to load
-            page_obj.wait_for_timeout(3000)
+            page_obj.wait_for_timeout(2000)
 
             # Check if we're in queue
             if "queue-it.net" in page_obj.url.lower():
@@ -1056,3 +1260,32 @@ def scrape_federal_clubs(
         logger.error(f"Error in federal clubs scraping: {e}")
         logger.error(f"Traceback: {traceback.format_exc()}")
         return {"typeRecherche": "clubs", "nombreResultat": 0, "club_markers": []}
+
+
+def get_browser_for_environment(playwright_instance):
+    """
+    Get the appropriate browser for the current environment:
+    - Development (DEBUG=True): Firefox (works better on macOS 15+)
+    - Production (DEBUG=False): Chromium (the configuration known to work there)
+    """
+    is_development = getattr(settings, "DEBUG", False)
+
+    if is_development:
+        # logger.info("Development environment detected - using Firefox")
+        try:
+            return playwright_instance.firefox.launch(
+                headless=True, args=["--no-sandbox"]
+            )
+        except Exception as firefox_error:
+            logger.warning(
+                f"Firefox failed in dev, falling back to Chromium: {firefox_error}"
+            )
+            return playwright_instance.chromium.launch(
+                headless=True,
+                args=["--no-sandbox", "--disable-dev-shm-usage", "--single-process"],
+            )
+    else:
+        logger.info("Production environment detected - using Chromium")
+        return playwright_instance.chromium.launch(
+            headless=True, args=["--no-sandbox", "--disable-dev-shm-usage"]
+        )
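
A quick smoke test for the new helper, runnable from a Django shell or a
standalone script (a sketch only; the settings module name below is a
placeholder for the project's own):

    import os
    os.environ.setdefault("DJANGO_SETTINGS_MODULE", "project.settings")  # hypothetical module name

    import django
    django.setup()

    from playwright.sync_api import sync_playwright
    from api.utils import get_browser_for_environment

    with sync_playwright() as p:
        browser = get_browser_for_environment(p)
        print(browser.browser_type.name)  # "firefox" when DEBUG=True, else "chromium"
        browser.close()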