import json
import logging
import re
import traceback
from concurrent.futures import ThreadPoolExecutor
from datetime import datetime, timedelta

import requests
from bs4 import BeautifulSoup
from django.conf import settings
from playwright.sync_api import sync_playwright

logger = logging.getLogger(__name__)


def check_version_smaller_than_1_1_12(version_str):
    """Return True if the given version string is strictly lower than 1.1.12."""
    # Drop any parenthesised build suffix, e.g. "1.1.12 (2)" -> "1.1.12"
    parts = version_str.split()
    if not parts:
        return False
    version_parts = [int(x) for x in parts[0].split(".")]
    # Python compares lists of ints element by element, which matches
    # dotted-numeric version ordering.
    return version_parts < [1, 1, 12]
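
# A quick sanity check of the comparison semantics (illustrative values):
#
#   check_version_smaller_than_1_1_12("1.1.11")      # True:  [1, 1, 11] < [1, 1, 12]
#   check_version_smaller_than_1_1_12("1.1.12 (2)")  # False: equal after suffix strip
#   check_version_smaller_than_1_1_12("1.2.0")       # False: [1, 2, 0] > [1, 1, 12]
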
def scrape_fft_club_tournaments(
    club_code, club_name, start_date=None, end_date=None, page=0
):
    """
    Scrapes a single page of FFT tournaments for a club via the Ten'Up
    Drupal AJAX endpoint, using Playwright.
    """
    try:
        with sync_playwright() as p:
            browser = get_browser_for_environment(p)
            page_obj = browser.new_page()
            page_obj.set_extra_http_headers(
                {
                    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/18.5 Safari/605.1.15"
                }
            )

            # Navigate to the FFT tournament search page
            target_url = "https://tenup.fft.fr/recherche/tournois"
            page_obj.goto(target_url)
            page_obj.wait_for_timeout(2000)

            if "queue-it.net" in page_obj.url.lower():
                logger.warning("Still in Queue-It")
                browser.close()
                return None

            # Extract form_build_id, required by the Drupal AJAX endpoint
            form_input = page_obj.query_selector('input[name="form_build_id"]')
            if not form_input:
                logger.error("Could not find form_build_id")
                browser.close()
                return None
            form_build_id = form_input.get_attribute("value")

            # Build the date window: explicit range, or 90 days from start_date
            date_component = ""
            if start_date and end_date:
                date_component = f"&date[start]={start_date}&date[end]={end_date}"
            elif start_date:
                try:
                    start_dt = datetime.strptime(start_date, "%d/%m/%y")
                    end_dt = start_dt + timedelta(days=90)
                    date_component = (
                        f"&date[start]={start_date}"
                        f"&date[end]={end_dt.strftime('%d/%m/%y')}"
                    )
                except ValueError:
                    logger.warning(f"Invalid date format: {start_date}")

            club_name_encoded = club_name.replace(" ", "+")
            club_code_clean = club_code.replace(" ", "")
            params = (
                f"recherche_type=club"
                f"&club[autocomplete][value_container][value_field]={club_code_clean}"
                f"&club[autocomplete][value_container][label_field]={club_name_encoded}"
                f"&pratique=PADEL{date_component}&page={page}&sort=dateDebut+asc"
                f"&form_build_id={form_build_id}&form_id=recherche_tournois_form"
                f"&_triggering_element_name=submit_page"
                f"&_triggering_element_value=Submit+page"
            )

            # Make the AJAX request from within the page context and capture
            # the full response for debugging
            ajax_script = f"""
                async () => {{
                    try {{
                        const response = await fetch('https://tenup.fft.fr/system/ajax', {{
                            method: 'POST',
                            headers: {{
                                'Accept': 'application/json, text/javascript, */*; q=0.01',
                                'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
                                'X-Requested-With': 'XMLHttpRequest',
                                'Origin': 'https://tenup.fft.fr',
                                'Referer': 'https://tenup.fft.fr/recherche/tournois'
                            }},
                            body: `{params}`
                        }});
                        const status = response.status;
                        const responseText = await response.text();
                        return {{
                            success: response.ok,
                            status: status,
                            responseText: responseText
                        }};
                    }} catch (error) {{
                        return {{ success: false, error: error.message }};
                    }}
                }}
            """
            result = page_obj.evaluate(ajax_script)
            browser.close()

            if result.get("success"):
                response_text = result.get("responseText", "")
                try:
                    json_data = json.loads(response_text)
                    return _parse_ajax_response(json_data)
                except json.JSONDecodeError as json_error:
                    logger.error(f"JSON Parse Error: {json_error}")
                    logger.error(f"Response text: {response_text}")
                    return None
            else:
                logger.error(f"AJAX request failed: {result.get('error')}")
                return None

    except Exception as e:
        logger.error(f"Error in Playwright scraping: {e}")
        logger.error(f"Traceback: {traceback.format_exc()}")
        return None


def scrape_fft_club_tournaments_all_pages(
    club_code, club_name, start_date=None, end_date=None
):
    """
    Scrapes all pages of FFT tournaments for a specific club.
    """
    all_tournaments = []
    page = 0

    while True:
        try:
            result = scrape_fft_club_tournaments(
                club_code=club_code,
                club_name=club_name,
                start_date=start_date,
                end_date=end_date,
                page=page,
            )
            if not result:
                logger.warning(f"No result for page {page}")
                break

            tournaments = result.get("tournaments", [])
            if not tournaments:
                break
            all_tournaments.extend(tournaments)

            # Stop once we have collected every reported result
            total_results = result.get("total_results", 0)
            if len(all_tournaments) >= total_results:
                break

            page += 1
            # time.sleep(1)  # optional rate limiting between pages
        except Exception as e:
            logger.error(f"Error on page {page}: {e}")
            break

    return {
        "tournaments": all_tournaments,
        "total_results": len(all_tournaments),
        "current_count": len(all_tournaments),
        "pages_scraped": page + 1,
    }
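
# Minimal usage sketch (club identifiers are hypothetical; dates use the
# French %d/%m/%y format the endpoint expects):
#
#   result = scrape_fft_club_tournaments_all_pages(
#       club_code="57670001",            # hypothetical club code
#       club_name="Example Padel Club",  # hypothetical club name
#       start_date="01/09/25",
#   )
#   print(result["total_results"], "tournaments collected")
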
def _parse_ajax_response(commands):
    """
    Parse the AJAX response commands to extract tournament data.
    Returns data in the exact format expected by the Swift FederalTournament struct.
    """
    tournaments = []
    try:
        # An "alert" command indicates maintenance mode
        for command in commands:
            if command.get("command") == "alert":
                logger.warning("Maintenance mode detected")
                return None

        # Find the command carrying the search results
        result_command = None
        for command in commands:
            if command.get("command") == "recherche_tournois_update":
                result_command = command
                break

        if result_command and result_command.get("results"):
            results = result_command["results"]
            items = results.get("items", [])
            total_results = results.get("nb_results", 0)

            for item in items:
                # Parse dates - they are already in the correct format
                date_debut = item.get("dateDebut")
                date_fin = item.get("dateFin")
                date_validation = item.get("dateValidation")

                # Build the tournament object to match the Swift
                # FederalTournament structure
                tournament = {
                    "id": str(item.get("id", "")),
                    "millesime": item.get("millesime"),
                    "libelle": item.get("libelle"),
                    "tmc": item.get("tmc"),
                    "tarifAdulteChampionnat": item.get("tarifAdulteChampionnat"),
                    "type": item.get("type"),
                    "ageReel": item.get("ageReel"),
                    "naturesTerrains": item.get("naturesTerrains", []),
                    "idsArbitres": item.get("idsArbitres", []),
                    "tarifJeuneChampionnat": item.get("tarifJeuneChampionnat"),
                    "international": item.get("international"),
                    "inscriptionEnLigne": item.get("inscriptionEnLigne"),
                    "categorieTournoi": item.get("categorieTournoi"),
                    "prixLot": item.get("prixLot"),
                    "paiementEnLigne": item.get("paiementEnLigne"),
                    "reductionAdherentJeune": item.get("reductionAdherentJeune"),
                    "reductionAdherentAdulte": item.get("reductionAdherentAdulte"),
                    "paiementEnLigneObligatoire": item.get(
                        "paiementEnLigneObligatoire"
                    ),
                    "villeEngagement": item.get("villeEngagement"),
                    "senior": item.get("senior"),
                    "veteran": item.get("veteran"),
                    "inscriptionEnLigneEnCours": item.get("inscriptionEnLigneEnCours"),
                    "avecResultatPublie": item.get("avecResultatPublie"),
                    "code": item.get("code"),
                    "categorieAge": item.get("categorieAge"),
                    "codeComite": item.get("codeComite"),
                    "installations": item.get("installations", []),
                    "reductionEpreuveSupplementaireJeune": item.get(
                        "reductionEpreuveSupplementaireJeune"
                    ),
                    "reductionEpreuveSupplementaireAdulte": item.get(
                        "reductionEpreuveSupplementaireAdulte"
                    ),
                    "nomComite": item.get("nomComite"),
                    "naturesEpreuves": item.get("naturesEpreuves"),
                    "jeune": item.get("jeune"),
                    "courrielEngagement": item.get("courrielEngagement"),
                    "nomClub": item.get("nomClub"),
                    "installation": item.get("installation"),
                    "categorieAgeMax": item.get("categorieAgeMax"),
                    "tournoiInterne": item.get("tournoiInterne"),
                    "nomLigue": item.get("nomLigue"),
                    "nomEngagement": item.get("nomEngagement"),
                    "codeLigue": item.get("codeLigue"),
                    "modeleDeBalle": item.get("modeleDeBalle"),
                    "jugeArbitre": item.get("jugeArbitre"),
                    "adresse2Engagement": item.get("adresse2Engagement"),
                    "epreuves": item.get("epreuves"),
                    "dateDebut": date_debut,
                    "serie": item.get("serie"),
                    "dateFin": date_fin,
                    "dateValidation": date_validation,
                    "codePostalEngagement": item.get("codePostalEngagement"),
                    "codeClub": item.get("codeClub"),
                    "prixEspece": item.get("prixEspece"),
                    "japPhoneNumber": None,  # Will be populated by separate umpire call
                    # Additional fields from the response
                    "adresse1Engagement": item.get("adresse1Engagement"),
                    "originalId": item.get("originalId"),
                    "familleTournoi": item.get("familleTournoi", []),
                    "isTournoi": item.get("isTournoi"),
                    "natureWithCatAge": item.get("natureWithCatAge"),
                }
                tournaments.append(tournament)

            return {
                "tournaments": tournaments,
                "total_results": total_results,
                "current_count": len(tournaments),
            }
        else:
            logger.error("No recherche_tournois_update command found in AJAX response")
            return {"tournaments": [], "total_results": 0, "current_count": 0}

    except Exception as e:
        logger.error(f"Error parsing AJAX response: {e}")
        return None
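
# For reference, the Drupal AJAX payload consumed above is assumed to look
# roughly like this (shape inferred from the parsing code, not from any
# official FFT documentation):
#
#   [
#       {"command": "settings", ...},
#       {
#           "command": "recherche_tournois_update",
#           "results": {
#               "nb_results": 42,
#               "items": [{"id": 123, "libelle": "...", "dateDebut": "...", ...}],
#           },
#       },
#   ]
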
def scrape_fft_all_tournaments(
    sorting_option=None,
    page=0,
    start_date=None,
    end_date=None,
    city="",
    distance=15,
    categories=None,
    levels=None,
    lat=None,
    lng=None,
    ages=None,
    tournament_types=None,
    national_cup=False,
):
    """
    Scrapes a single page of FFT tournaments for a city (or ligue-wide) search.
    Based on the working scrape_fft_club_tournaments function.
    """
    try:
        with sync_playwright() as p:
            browser = get_browser_for_environment(p)
            page_obj = browser.new_page()
            page_obj.set_extra_http_headers(
                {
                    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/18.5 Safari/605.1.15"
                }
            )

            # Navigate to the FFT tournament search page
            target_url = "https://tenup.fft.fr/recherche/tournois"
            page_obj.goto(target_url)
            page_obj.wait_for_timeout(500)

            if "queue-it.net" in page_obj.url.lower():
                logger.warning("Still in Queue-It")
                browser.close()
                return None

            # Extract form_build_id
            form_input = page_obj.query_selector('input[name="form_build_id"]')
            if not form_input:
                logger.error("Could not find form_build_id")
                browser.close()
                return None
            form_build_id = form_input.get_attribute("value")

            # Build the date window, same pattern as the club function
            date_component = ""
            if start_date and end_date:
                date_component = f"&date[start]={start_date}&date[end]={end_date}"
            elif start_date:
                try:
                    start_dt = datetime.strptime(start_date, "%d/%m/%y")
                    end_dt = start_dt + timedelta(days=90)
                    date_component = (
                        f"&date[start]={start_date}"
                        f"&date[end]={end_dt.strftime('%d/%m/%y')}"
                    )
                except ValueError:
                    logger.warning(f"Invalid date format: {start_date}")

            # Build filter parameters
            filter_params = ""
            if categories:
                for category in categories:
                    filter_params += f"&epreuve[{category}]={category}"
            if levels:
                for level in levels:
                    filter_params += f"&categorie_tournoi[{level}]={level}"
            if ages:
                for age in ages:
                    filter_params += f"&categorie_age[{age}]={age}"
            if tournament_types:
                for t_type in tournament_types:
                    capitalized_type = t_type.capitalize()
                    filter_params += f"&type[{capitalized_type}]={capitalized_type}"
            if national_cup:
                filter_params += "&tournoi_npc=1"

            # Sorting parameter (defaults to start date ascending)
            if sorting_option:
                sort_param = f"&sort={sorting_option}"
            else:
                sort_param = "&sort=dateDebut+asc"
            # Build the search parameters: city search with distance and
            # optional coordinates, or a ligue-wide search as the default
            if city and city.strip():
                city_name_encoded = city.strip().replace(" ", "+")
                base_params = (
                    f"recherche_type=ville"
                    f"&ville[autocomplete][value_container][value_field]={city_name_encoded}"
                    f"&ville[autocomplete][value_container][label_field]={city_name_encoded}"
                )
                distance_param = f"&ville[distance][value_field]={int(distance)}"
                location_params = ""
                if lat and lng:
                    location_params = (
                        f"&ville[autocomplete][value_container][lat_field]={lat}"
                        f"&ville[autocomplete][value_container][lng_field]={lng}"
                    )
                # Combine all parameters, including the filters
                params = (
                    f"{base_params}{location_params}{distance_param}"
                    f"&pratique=PADEL{date_component}{filter_params}"
                    f"&page={page}{sort_param}"
                    f"&form_build_id={form_build_id}&form_id=recherche_tournois_form"
                    f"&_triggering_element_name=submit_page"
                    f"&_triggering_element_value=Submit+page"
                )
            else:
                # Default to a ligue-wide search when no city is provided
                params = (
                    f"recherche_type=ligue&pratique=PADEL{date_component}{filter_params}"
                    f"&page={page}{sort_param}"
                    f"&form_build_id={form_build_id}&form_id=recherche_tournois_form"
                    f"&_triggering_element_name=submit_page"
                    f"&_triggering_element_value=Submit+page"
                )

            # Make the AJAX request and capture the full response,
            # exactly as in the club function
            ajax_script = f"""
                async () => {{
                    try {{
                        const response = await fetch('https://tenup.fft.fr/system/ajax', {{
                            method: 'POST',
                            headers: {{
                                'Accept': 'application/json, text/javascript, */*; q=0.01',
                                'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
                                'X-Requested-With': 'XMLHttpRequest',
                                'Origin': 'https://tenup.fft.fr',
                                'Referer': 'https://tenup.fft.fr/recherche/tournois'
                            }},
                            body: `{params}`
                        }});
                        const status = response.status;
                        const responseText = await response.text();
                        return {{
                            success: response.ok,
                            status: status,
                            responseText: responseText
                        }};
                    }} catch (error) {{
                        return {{ success: false, error: error.message }};
                    }}
                }}
            """
            result = page_obj.evaluate(ajax_script)
            browser.close()

            if result.get("success"):
                response_text = result.get("responseText", "")
                try:
                    json_data = json.loads(response_text)
                    # Reuse the club parser: the payload shape is identical
                    return _parse_ajax_response(json_data)
                except json.JSONDecodeError as json_error:
                    logger.error(f"JSON Parse Error: {json_error}")
                    logger.error(f"Response text: {response_text}")
                    return None
            else:
                logger.error(f"AJAX request failed: {result.get('error')}")
                return None

    except Exception as e:
        logger.error(f"Error in Playwright scraping: {e}")
        logger.error(f"Traceback: {traceback.format_exc()}")
        return None
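
# Illustrative single-page search (coordinates are Marseille's; the level
# codes follow the P25..P2000 padel categories and are assumed to match the
# form's field values, which are not officially documented):
#
#   result = scrape_fft_all_tournaments(
#       city="Marseille", distance=30, lat=43.2965, lng=5.3698,
#       start_date="01/09/25", levels=["P100", "P250"],
#   )
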
def get_umpire_data(tournament_id):
    """
    Umpire data extraction with improved Queue-It handling.
    """
    try:
        with sync_playwright() as p:
            browser = get_browser_for_environment(p)
            page = browser.new_page()

            # Navigate to the tournament page
            url = f"https://tenup.fft.fr/tournoi/{tournament_id}"
            try:
                # Navigate with a reasonable timeout
                page.goto(url, timeout=30000, wait_until="domcontentloaded")

                # Enhanced Queue-It handling (similar to the tournament search)
                if "queue-it.net" in page.url.lower():
                    logger.warning(f"Queue-It detected for tournament {tournament_id}")

                    # Wait strategy based on environment: 2 min prod, 30 s dev
                    max_queue_wait = 120000 if not settings.DEBUG else 30000
                    check_interval = 1000  # 1 second
                    elapsed_time = 0

                    while (
                        elapsed_time < max_queue_wait
                        and "queue-it.net" in page.url.lower()
                    ):
                        page.wait_for_timeout(check_interval)
                        elapsed_time += check_interval

                        # Stop as soon as we are redirected out of the queue
                        if "queue-it.net" not in page.url.lower():
                            break

                    # If still in the queue, hand over to the requests fallback
                    if "queue-it.net" in page.url.lower():
                        logger.warning("Umpire queue timeout - trying requests fallback")
                        browser.close()
                        return _get_umpire_data_requests_fallback(tournament_id)

                # Wait for the page to load properly
                page.wait_for_load_state("networkidle", timeout=20000)

                html_content = page.content()

                # Strategy 1: regex patterns over the raw HTML
                name, email, phone = _extract_umpire_with_regex(html_content)

                # Strategy 2: DOM selectors if the regexes found nothing
                if not name and not email and not phone:
                    name, email, phone = _extract_umpire_with_selectors(page)

                browser.close()

                if name or email or phone:
                    return name, email, phone
                logger.warning(f"No umpire data found for tournament {tournament_id}")
                return None, None, None

            except Exception as page_error:
                logger.error(
                    f"Error loading tournament page {tournament_id}: {page_error}"
                )
                browser.close()
                # Try the requests fallback after a Playwright error
                return _get_umpire_data_requests_fallback(tournament_id)

    except Exception as e:
        logger.error(f"Error in umpire data extraction for {tournament_id}: {e}")
        return None, None, None
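
# Typical call site (the tournament id is illustrative); each of the three
# values may be None when the page exposes no referee contact block:
#
#   name, email, phone = get_umpire_data("1234567")
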
def _extract_umpire_with_regex(html_content):
    """
    Extract umpire data using regex patterns.
    """
    # Extract name
    name_pattern = r'tournoi-detail-page-inscription-responsable-title">\s*([^<]+)\s*<'
    name_match = re.search(name_pattern, html_content)
    name = name_match.group(1).strip() if name_match else None

    # Extract email
    email_pattern = r'mailto:([^"]+)"'
    email_match = re.search(email_pattern, html_content)
    email = email_match.group(1) if email_match else None

    # Extract phone - try several French number formats
    phone_patterns = [
        r"(\d{2}\s+\d{2}\s+\d{2}\s+\d{2}\s+\d{2})",
        r"(\d{2}\.\d{2}\.\d{2}\.\d{2}\.\d{2})",
        r"(\d{10})",
        r"(\+33\s?\d{1}\s?\d{2}\s?\d{2}\s?\d{2}\s?\d{2})",
    ]
    phone = None
    for pattern in phone_patterns:
        phone_match = re.search(pattern, html_content)
        if phone_match:
            phone = phone_match.group(1).strip()
            break

    return name, email, phone
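
# The phone patterns above cover the common French renderings
# (sample matches, illustrative only):
#
#   "01 23 45 67 89"     -> spaced pairs
#   "01.23.45.67.89"     -> dotted pairs
#   "0123456789"         -> compact 10 digits
#   "+33 1 23 45 67 89"  -> international prefix
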
def _extract_umpire_with_selectors(page):
    """
    Extract umpire data using DOM selectors as a fallback.
    """
    name = None
    email = None
    phone = None

    try:
        # Try different selectors for the name
        name_selectors = [
            ".tournoi-detail-page-inscription-responsable-title",
            '[class*="responsable-title"]',
            '[class*="umpire-name"]',
            'h3:has-text("Responsable")',
        ]
        for selector in name_selectors:
            try:
                element = page.query_selector(selector)
                if element:
                    name = element.inner_text().strip()
                    if name:
                        break
            except Exception:
                continue

        # Try different selectors for the email
        email_selectors = [
            'a[href^="mailto:"]',
            '[class*="email"]',
        ]
        for selector in email_selectors:
            try:
                element = page.query_selector(selector)
                if element:
                    href = element.get_attribute("href")
                    if href and href.startswith("mailto:"):
                        email = href.replace("mailto:", "")
                        break
                    text = element.inner_text().strip()
                    if "@" in text:
                        email = text
                        break
            except Exception:
                continue

        # Try different selectors for the phone
        phone_selectors = [
            ".details-bloc",
            '[class*="phone"]',
            '[class*="telephone"]',
        ]
        for selector in phone_selectors:
            try:
                element = page.query_selector(selector)
                if element:
                    text = element.inner_text().strip()
                    # Keep the text only if it looks like a phone number
                    if re.match(r"[\d\s\.\+\-\(\)]{8,}", text):
                        phone = text
                        break
            except Exception:
                continue

    except Exception as e:
        logger.warning(f"Error in selector-based extraction: {e}")

    return name, email, phone


def _get_umpire_data_requests_fallback(tournament_id):
    """
    Enhanced fallback method using requests (may itself hit Queue-It).
    """
    try:
        headers = {
            "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36",
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
            "Accept-Language": "fr-FR,fr;q=0.9,en;q=0.8",
            "Accept-Encoding": "gzip, deflate, br",
            "Connection": "keep-alive",
        }
        url = f"https://tenup.fft.fr/tournoi/{tournament_id}"
        response = requests.get(url, headers=headers, timeout=15)

        if "queue-it.net" in response.url:
            logger.warning("Requests fallback also hit Queue-It")
            return None, None, None
        if response.status_code != 200:
            logger.error(f"Requests fallback failed: {response.status_code}")
            return None, None, None

        # Try regex extraction on the requests response
        name, email, phone = _extract_umpire_with_regex(response.text)

        # If the regexes fail, try BeautifulSoup
        if not name and not email and not phone:
            try:
                soup = BeautifulSoup(response.text, "html.parser")

                # Try to find the name
                name_elements = soup.find_all(
                    class_=lambda x: x and "responsable" in x.lower()
                )
                if name_elements:
                    name = name_elements[0].get_text().strip()

                # Try to find the email
                email_links = soup.find_all(
                    "a", href=lambda x: x and x.startswith("mailto:")
                )
                if email_links:
                    email = email_links[0]["href"].replace("mailto:", "")

                # Try to find a phone number in any text node
                for element in soup.find_all(text=True):
                    if re.search(
                        r"\d{2}[\s\.]\d{2}[\s\.]\d{2}[\s\.]\d{2}[\s\.]\d{2}",
                        str(element),
                    ):
                        phone = str(element).strip()
                        break
            except Exception as soup_error:
                logger.warning(f"BeautifulSoup parsing failed: {soup_error}")

        return name, email, phone

    except Exception as e:
        logger.error(f"Requests fallback error: {e}")
        return None, None, None
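
# Extraction strategy, in order of preference: (1) regex over the Playwright
# page HTML, (2) DOM selectors on the live page, (3) plain requests with a
# regex pass and a BeautifulSoup sweep. Each tier runs only when the previous
# one returned nothing, so a partial hit short-circuits early.
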
def _scrape_single_page(
    sorting_option,
    page,
    start_date,
    end_date,
    city,
    distance,
    categories,
    levels,
    lat,
    lng,
    ages,
    tournament_types,
    national_cup,
):
    """
    Helper function to scrape a single page of tournaments
    (one unit of work for the thread pool).
    """
    return scrape_fft_all_tournaments(
        sorting_option=sorting_option,
        page=page,
        start_date=start_date,
        end_date=end_date,
        city=city,
        distance=distance,
        categories=categories,
        levels=levels,
        lat=lat,
        lng=lng,
        ages=ages,
        tournament_types=tournament_types,
        national_cup=national_cup,
    )
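
# Note: every page fetch below launches its own Playwright browser via
# scrape_fft_all_tournaments, so each worker thread is fairly heavy;
# max_workers should stay modest to avoid exhausting memory.
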
def scrape_fft_all_tournaments_concurrent(
    sorting_option=None,
    start_date=None,
    end_date=None,
    city="",
    distance=15,
    categories=None,
    levels=None,
    lat=None,
    lng=None,
    ages=None,
    tournament_types=None,
    national_cup=False,
    max_workers=10,
):
    """
    Scrapes all remaining pages of FFT tournaments concurrently (pages 1 to end).
    Assumes page 0 was already fetched by the client.
    """
    # Fetch the first page to determine total results and page size
    first_page_result = scrape_fft_all_tournaments(
        sorting_option=sorting_option,
        page=0,
        start_date=start_date,
        end_date=end_date,
        city=city,
        distance=distance,
        categories=categories,
        levels=levels,
        lat=lat,
        lng=lng,
        ages=ages,
        tournament_types=tournament_types,
        national_cup=national_cup,
    )
    if not first_page_result:
        logger.error("Failed to get first page results for pagination info")
        return None

    total_results = first_page_result.get("total_results", 0)
    first_page_tournaments = first_page_result.get("tournaments", [])
    results_per_page = len(first_page_tournaments)

    if total_results == 0:
        return {
            "tournaments": [],
            "total_results": 0,
            "current_count": 0,
            "pages_scraped": 0,
        }

    # Ceiling division to get the number of pages needed
    if results_per_page > 0:
        total_pages = (total_results + results_per_page - 1) // results_per_page
    else:
        total_pages = 1

    # If there is only one page, page 0 already covered everything
    if total_pages <= 1:
        return {
            "tournaments": [],
            "total_results": total_results,
            "current_count": 0,
            "pages_scraped": 0,
        }

    # Scrape the remaining pages (1 .. total_pages-1) concurrently,
    # honouring the max_workers cap
    all_tournaments = []
    max_concurrent = min(total_pages - 1, max_workers)

    with ThreadPoolExecutor(max_workers=max_concurrent) as executor:
        futures = []
        for page in range(1, total_pages):
            future = executor.submit(
                _scrape_single_page,
                sorting_option,
                page,
                start_date,
                end_date,
                city,
                distance,
                categories,
                levels,
                lat,
                lng,
                ages,
                tournament_types,
                national_cup,
            )
            futures.append((page, future))

        # Collect results in submission order
        for page, future in futures:
            try:
                result = future.result(timeout=60)  # 60 second timeout per page
                if result and result.get("tournaments"):
                    all_tournaments.extend(result.get("tournaments", []))
                else:
                    logger.warning(f"Page {page} returned no results")
            except Exception as e:
                logger.error(f"Error processing page {page}: {e}")

    return {
        "tournaments": all_tournaments,
        "total_results": total_results,
        "current_count": len(all_tournaments),
        "pages_scraped": total_pages - 1,  # excluding page 0, handled by the client
    }
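
# Intended client flow: fetch page 0 yourself, then let the concurrent helper
# collect the remaining pages and merge (sketch, names illustrative):
#
#   page0 = scrape_fft_all_tournaments(city="Lyon", page=0)
#   rest = scrape_fft_all_tournaments_concurrent(city="Lyon")
#   if page0 and rest:
#       combined = page0["tournaments"] + rest["tournaments"]
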
def _parse_clubs_ajax_response(json_data):
    """
    Parse the clubs AJAX response to match the Swift FederalClubResponse structure.
    """
    try:
        club_markers = []
        total_results = 0

        # The endpoint has been observed returning several shapes; try each
        if isinstance(json_data, dict):
            # Pattern 1: direct club_markers array
            if "club_markers" in json_data:
                clubs_data = json_data["club_markers"]
                total_results = json_data.get("nombreResultat", len(clubs_data))
            # Pattern 2: results wrapper
            elif "results" in json_data:
                results = json_data["results"]
                clubs_data = results.get(
                    "clubs", results.get("items", results.get("club_markers", []))
                )
                total_results = results.get(
                    "nombreResultat",
                    results.get("total", results.get("nb_results", len(clubs_data))),
                )
            # Pattern 3: data wrapper
            elif "data" in json_data:
                clubs_data = json_data["data"]
                total_results = len(clubs_data)
            # Pattern 4: unknown dict shape, nothing to extract
            else:
                clubs_data = []
                total_results = 0
        elif isinstance(json_data, list):
            clubs_data = json_data
            total_results = len(clubs_data)
        else:
            logger.error(f"Unexpected response format: {type(json_data)}")
            clubs_data = []
            total_results = 0

        # Parse each club to match the ClubMarker structure
        for item in clubs_data:
            if isinstance(item, dict):
                # Extract the pratiques array, defaulting to PADEL
                if "pratiques" in item:
                    pratiques = item["pratiques"]
                elif "practices" in item:
                    pratiques = item["practices"]
                else:
                    pratiques = ["PADEL"]

                # Normalise pratiques to uppercase strings
                pratiques = [
                    p.upper() if isinstance(p, str) else str(p).upper()
                    for p in pratiques
                ]

                club_marker = {
                    "nom": item.get("nom", item.get("name", "")),
                    "clubId": str(
                        item.get("clubId", item.get("id", item.get("code", "")))
                    ),
                    "ville": item.get("ville", item.get("city", "")),
                    "distance": str(item.get("distance", "0")),
                    "terrainPratiqueLibelle": item.get(
                        "terrainPratiqueLibelle", item.get("courtsInfo", "")
                    ),
                    "pratiques": pratiques,
                    "lat": float(item.get("lat", item.get("latitude", 0.0))),
                    "lng": float(item.get("lng", item.get("longitude", 0.0))),
                }
                club_markers.append(club_marker)

        # Return the shape expected by the Swift FederalClubResponse
        return {
            "typeRecherche": "clubs",
            "nombreResultat": total_results,
            "club_markers": club_markers,
        }

    except Exception as e:
        logger.error(f"Error parsing clubs AJAX response: {e}")
        return {"typeRecherche": "clubs", "nombreResultat": 0, "club_markers": []}


def scrape_federal_clubs(
    country=None, city="", latitude=None, longitude=None, radius=15, max_workers=5
):
    """
    Scrapes FFT federal clubs by extracting data from the HTML response.
    """
    try:
        with sync_playwright() as p:
            browser = get_browser_for_environment(p)
            page_obj = browser.new_page()
            page_obj.set_extra_http_headers(
                {
                    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/18.5 Safari/605.1.15"
                }
            )

            # Clean up the city name: drop the zip code and anything after it
            clean_city = city
            if city:
                clean_city = re.sub(r"[,\s]*\d{5}.*$", "", city).strip()
                clean_city = clean_city.rstrip(",").strip()

            # Build the results URL directly
            params = (
                f"ville={clean_city}&pratique=PADEL"
                f"&distance={int(radius)}&country={country or 'fr'}"
            )
            results_url = f"https://tenup.fft.fr/recherche/clubs/resultats?{params}"

            # Navigate to the results page and let it load
            page_obj.goto(results_url)
            page_obj.wait_for_timeout(2000)

            if "queue-it.net" in page_obj.url.lower():
                logger.warning("Hit Queue-It on results page")
                browser.close()
                return {
                    "typeRecherche": "clubs",
                    "nombreResultat": 0,
                    "club_markers": [],
                }

            # Extract the data the page exposes via Drupal.settings
            extraction_script = """
                () => {
                    try {
                        if (typeof Drupal !== 'undefined' && Drupal.settings &&
                            Drupal.settings.fft_recherche_club) {
                            const data = Drupal.settings.fft_recherche_club;
                            return {
                                success: true,
                                typeRecherche: data.typeRecherche || 'club',
                                total: data.total || 0,
                                resultat: data.resultat || []
                            };
                        }
                        return {
                            success: false,
                            error: 'Drupal.settings.fft_recherche_club not found'
                        };
                    } catch (error) {
                        return { success: false, error: error.message };
                    }
                }
            """
            result = page_obj.evaluate(extraction_script)
            browser.close()

            if result.get("success"):
                type_recherche = result.get("typeRecherche", "club")
                total = result.get("total", 0)
                resultat = result.get("resultat", [])

                # Convert resultat to the club_markers format
                club_markers = []
                for club in resultat:
                    club_markers.append(
                        {
                            "nom": club.get("nom", ""),
                            "clubId": club.get("clubId", ""),
                            "ville": club.get("ville", ""),
                            "distance": club.get("distance", ""),
                            "terrainPratiqueLibelle": club.get(
                                "terrainPratiqueLibelle", ""
                            ),
                            "pratiques": club.get("pratiques", []),
                            "lat": club.get("lat", 0.0),
                            "lng": club.get("lng", 0.0),
                        }
                    )

                return {
                    "typeRecherche": type_recherche,
                    "nombreResultat": total,
                    "club_markers": club_markers,
                }
            else:
                logger.error(f"Failed to extract data: {result.get('error')}")
                return {
                    "typeRecherche": "clubs",
                    "nombreResultat": 0,
                    "club_markers": [],
                }

    except Exception as e:
        logger.error(f"Error in federal clubs scraping: {e}")
        logger.error(f"Traceback: {traceback.format_exc()}")
        return {"typeRecherche": "clubs", "nombreResultat": 0, "club_markers": []}


def get_browser_for_environment(playwright_instance):
    """
    Get the appropriate browser for the environment:
    - Development (DEBUG=True): Firefox (works better on macOS 15+)
    - Production (DEBUG=False): Chromium (proven in production)
    """
    is_development = getattr(settings, "DEBUG", False)

    if is_development:
        try:
            return playwright_instance.firefox.launch(
                headless=True, args=["--no-sandbox"]
            )
        except Exception as firefox_error:
            logger.warning(
                f"Firefox failed in dev, falling back to Chromium: {firefox_error}"
            )
            return playwright_instance.chromium.launch(
                headless=True,
                args=["--no-sandbox", "--disable-dev-shm-usage", "--single-process"],
            )
    return playwright_instance.chromium.launch(
        headless=True, args=["--no-sandbox", "--disable-dev-shm-usage"]
    )
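
# Example club search (the city string may carry a zip code, which is
# stripped before the request; values illustrative):
#
#   clubs = scrape_federal_clubs(city="Nice, 06000", radius=20)
#   for marker in clubs["club_markers"]:
#       print(marker["nom"], marker["distance"])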