def scrape_fft_all_tournaments(
    sorting_option=None,
    page=0,
    start_date=None,
    end_date=None,
    city="",
    distance=15,
    categories=None,
    levels=None,
    lat=None,
    lng=None,
    ages=None,
    tournament_types=None,
    national_cup=False,
):
    """
    Scrape FFT tournaments, falling back to a club-based search.

    The direct location-based search
    (``scrape_fft_all_tournaments_original``) is tried first. If it raises or
    returns ``None`` (for example because the site redirected to Queue-It) and
    a ``city`` was given, the search is retried through
    ``scrape_fft_all_tournaments_via_clubs``.

    NOTE(review): only the tail of this signature is visible in the patch this
    was taken from; the parameter list and defaults are reconstructed from the
    keyword arguments forwarded below and from the two sibling functions.
    Confirm against api/utils.py.

    Returns:
        The result of whichever strategy succeeded, or ``None`` when neither
        produced a result.
    """
    logger.info(f"Starting tournament scraping for city: {city}")

    # Both strategies take exactly the same keyword arguments.
    search_kwargs = {
        "sorting_option": sorting_option,
        "page": page,
        "start_date": start_date,
        "end_date": end_date,
        "city": city,
        "distance": distance,
        "categories": categories,
        "levels": levels,
        "lat": lat,
        "lng": lng,
        "ages": ages,
        "tournament_types": tournament_types,
        "national_cup": national_cup,
    }

    # First try the original direct approach
    try:
        logger.info("Attempting direct location-based search...")
        result = scrape_fft_all_tournaments_original(**search_kwargs)
    except Exception as e:
        logger.warning(f"Direct search failed: {e}")
        result = None

    if result is not None:
        logger.info("Direct search successful")
        return result

    # The club-based fallback needs a city to look clubs up.
    if not city:
        logger.error(
            "Direct search failed and no city was given for the club-based fallback"
        )
        return None

    logger.info("Falling back to club-based search...")
    result = scrape_fft_all_tournaments_via_clubs(**search_kwargs)
    if result is None:
        logger.error("Both direct and club-based approaches failed")
    return result


def _build_tournament_search_params(
    form_build_id,
    sorting_option,
    page,
    start_date,
    end_date,
    city,
    distance,
    categories,
    levels,
    lat,
    lng,
    ages,
    tournament_types,
    national_cup,
):
    """Build the form-encoded body posted to the tournament search AJAX endpoint."""
    # Date range: when only a start date is given, search the following 90 days.
    date_component = ""
    if start_date and end_date:
        date_component = f"&date[start]={start_date}&date[end]={end_date}"
    elif start_date:
        try:
            start_dt = datetime.strptime(start_date, "%d/%m/%y")
        except ValueError:
            logger.warning(f"Invalid date format: {start_date}")
        else:
            end_dt = start_dt + timedelta(days=90)
            date_component = (
                f"&date[start]={start_date}&date[end]={end_dt.strftime('%d/%m/%y')}"
            )

    filter_parts = []
    if categories:
        logger.info(f"Adding categories filter: {categories}")
        filter_parts.extend(f"&epreuve[{c}]={c}" for c in categories)
    if levels:
        logger.info(f"Adding levels filter: {levels}")
        filter_parts.extend(f"&categorie_tournoi[{lv}]={lv}" for lv in levels)
    if ages:
        logger.info(f"Adding ages filter: {ages}")
        filter_parts.extend(f"&categorie_age[{a}]={a}" for a in ages)
    if tournament_types:
        logger.info(f"Adding types filter: {tournament_types}")
        for t_type in tournament_types:
            capitalized_type = t_type.capitalize()
            filter_parts.append(f"&type[{capitalized_type}]={capitalized_type}")
    if national_cup:
        logger.info("Adding national cup filter")
        filter_parts.append("&tournoi_npc=1")
    filter_params = "".join(filter_parts)

    sort_param = f"&sort={sorting_option or 'dateDebut+asc'}"

    city_name = (city or "").strip()
    if city_name:
        city_name_encoded = city_name.replace(" ", "+")
        search_scope = (
            "recherche_type=ville"
            f"&ville[autocomplete][value_container][value_field]={city_name_encoded}"
            f"&ville[autocomplete][value_container][label_field]={city_name_encoded}"
        )
        if lat and lng:
            search_scope += (
                f"&ville[autocomplete][value_container][lat_field]={lat}"
                f"&ville[autocomplete][value_container][lng_field]={lng}"
            )
        search_scope += f"&ville[distance][value_field]={int(distance)}"
    else:
        # Default to ligue search if no city provided
        search_scope = "recherche_type=ligue"

    # The previous city branch left filter_params out of the request, so
    # category/level/age/type/national-cup filters were silently ignored for
    # city searches. Both branches now send them.
    return (
        f"{search_scope}&pratique=PADEL{date_component}{filter_params}"
        f"&page={page}{sort_param}"
        f"&form_build_id={form_build_id}"
        "&form_id=recherche_tournois_form"
        "&_triggering_element_name=submit_page"
        "&_triggering_element_value=Submit+page"
    )


def scrape_fft_all_tournaments_original(
    sorting_option=None,
    page=0,
    start_date=None,
    end_date=None,
    city="",
    distance=15,
    categories=None,
    levels=None,
    lat=None,
    lng=None,
    ages=None,
    tournament_types=None,
    national_cup=False,
):
    """
    Scrape FFT tournaments directly from the tournament search page.

    Opens https://tenup.fft.fr/recherche/tournois in headless Chromium, reads
    the page's ``form_build_id`` and posts the search form to
    https://tenup.fft.fr/system/ajax from inside the page. The JSON answer is
    handed to ``_parse_ajax_response``.

    Returns:
        Whatever ``_parse_ajax_response`` returns for the decoded response, or
        ``None`` when the browser ends up on a Queue-It URL, the form id is
        missing, the request fails, the response is not JSON, or any other
        exception occurs (exceptions are logged, not raised).
    """
    logger.info(f"Starting Playwright scraping for city: {city}")

    # The request body is passed in as the function argument rather than being
    # interpolated into the script text, so characters such as backticks,
    # "${" or backslashes in user-supplied values cannot alter the script.
    ajax_script = """
    async (body) => {
        try {
            const response = await fetch('https://tenup.fft.fr/system/ajax', {
                method: 'POST',
                headers: {
                    'Accept': 'application/json, text/javascript, */*; q=0.01',
                    'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
                    'X-Requested-With': 'XMLHttpRequest',
                    'Origin': 'https://tenup.fft.fr',
                    'Referer': 'https://tenup.fft.fr/recherche/tournois'
                },
                body: body
            });

            const status = response.status;
            const responseText = await response.text();

            return {
                success: response.ok,
                status: status,
                responseText: responseText
            };
        } catch (error) {
            return {
                success: false,
                error: error.message
            };
        }
    }
    """

    try:
        with sync_playwright() as p:
            browser = p.chromium.launch(headless=True)
            try:
                page_obj = browser.new_page()
                page_obj.set_extra_http_headers(
                    {
                        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/18.5 Safari/605.1.15"
                    }
                )

                # Navigate to FFT
                target_url = "https://tenup.fft.fr/recherche/tournois"
                logger.info(f"Navigating to: {target_url}")
                page_obj.goto(target_url)

                current_url = page_obj.url
                logger.info(f"Current URL: {current_url}")

                if "queue-it.net" in current_url.lower():
                    logger.warning("Still in Queue-It")
                    return None

                # Extract form_build_id
                form_input = page_obj.query_selector('input[name="form_build_id"]')
                if not form_input:
                    logger.error("Could not find form_build_id")
                    return None

                form_build_id = form_input.get_attribute("value")
                logger.info(f"Extracted form_build_id: {form_build_id}")

                params = _build_tournament_search_params(
                    form_build_id=form_build_id,
                    sorting_option=sorting_option,
                    page=page,
                    start_date=start_date,
                    end_date=end_date,
                    city=city,
                    distance=distance,
                    categories=categories,
                    levels=levels,
                    lat=lat,
                    lng=lng,
                    ages=ages,
                    tournament_types=tournament_types,
                    national_cup=national_cup,
                )
                logger.info(f"AJAX Parameters: {params}")

                logger.info("Making AJAX request...")
                result = page_obj.evaluate(ajax_script, params)
            finally:
                # Close the browser on every path, including exceptions.
                browser.close()

        logger.info(f"AJAX Response Status: {result.get('status')}")
        logger.info(f"AJAX Response Success: {result.get('success')}")

        if not result.get("success"):
            logger.error(f"AJAX request failed: {result.get('error')}")
            return None

        response_text = result.get("responseText", "")
        logger.info(f"Raw Response Length: {len(response_text)}")

        try:
            json_data = json.loads(response_text)
        except json.JSONDecodeError as json_error:
            logger.error(f"JSON Parse Error: {json_error}")
            logger.error(f"Response text: {response_text}")
            return None

        logger.info(f"JSON Response Type: {type(json_data)}")
        return _parse_ajax_response(json_data)

    except Exception as e:
        logger.error(f"Error in Playwright scraping: {e}")
        logger.error(f"Traceback: {traceback.format_exc()}")
        return None


def _tournament_matches_filters(
    tournament, categories, levels, ages, tournament_types, national_cup
):
    """Return True when a tournament dict passes every requested filter."""

    def field_contains(key, wanted):
        # A filter that was not requested never excludes anything.
        if not wanted:
            return True
        # Missing or None fields are treated as empty text instead of raising.
        haystack = str(tournament.get(key) or "").lower()
        return any(str(item).lower() in haystack for item in wanted)

    # NOTE(review): the direct search sends `categories` as "epreuve[...]" and
    # `levels` as "categorie_tournoi[...]", while here `categories` is matched
    # against "categorieTournoi" and `levels` against "niveau". The field
    # names are kept as they were; confirm them against the tournament
    # payload returned by scrape_fft_club_tournaments.
    if not field_contains("categorieTournoi", categories):
        return False
    if not field_contains("niveau", levels):
        return False
    if not field_contains("categorieAge", ages):
        return False
    if not field_contains("type", tournament_types):
        return False
    if national_cup and not tournament.get("tournoi_npc", False):
        return False
    return True


def _tournament_start_date(tournament):
    """Return the tournament's start date as a sortable string ("" if absent)."""
    start = tournament.get("dateDebut")
    if isinstance(start, dict):
        return str(start.get("date") or "")
    return str(start or "")


def scrape_fft_all_tournaments_via_clubs(
    sorting_option=None,
    page=0,
    start_date=None,
    end_date=None,
    city="",
    distance=15,
    categories=None,
    levels=None,
    lat=None,
    lng=None,
    ages=None,
    tournament_types=None,
    national_cup=False,
    *,
    max_clubs=50,
    page_size=20,
    request_delay=0.5,
):
    """
    Get tournaments by location using the club-based approach.

    1. Get clubs in the area using scrape_federal_clubs
    2. Get tournaments for each club using scrape_fft_club_tournaments
    3. Filter, de-duplicate, sort and paginate the aggregated results

    Args:
        max_clubs: Maximum number of clubs queried, to bound the request count.
        page_size: Number of tournaments per returned page.
        request_delay: Seconds to wait between two club requests.
        (The remaining parameters mirror scrape_fft_all_tournaments.)

    Returns:
        A dict with "tournaments" (the requested page), "total_results",
        "current_count" and "method"; the same dict without "method" and with
        empty values when no club is found; ``None`` when an unexpected error
        occurs (it is logged, not raised).
    """
    import time

    logger.info(
        f"Starting hybrid club-based tournament scraping for city: {city}, distance: {distance}km"
    )

    try:
        # Step 1: Get clubs in the area
        logger.info("Step 1: Getting clubs in the area...")
        clubs_result = scrape_federal_clubs(
            country="fr", city=city, latitude=lat, longitude=lng, radius=distance
        )

        if not clubs_result or clubs_result.get("nombreResultat", 0) == 0:
            logger.warning(f"No clubs found for city: {city}")
            return {"tournaments": [], "total_results": 0, "current_count": 0}

        clubs = clubs_result.get("club_markers") or []
        logger.info(f"Found {len(clubs)} clubs in {city} area")

        # Step 2: Get tournaments for each club
        all_tournaments = []
        processed_clubs = 0
        requests_made = 0

        for club in clubs[:max_clubs]:
            club_code = club.get("codeClub")
            club_name = club.get("nomClub")
            if not club_code or not club_name:
                continue

            # Pause between requests (not before the first one) to be
            # respectful of the remote site.
            if requests_made and request_delay > 0:
                time.sleep(request_delay)
            requests_made += 1

            logger.info(f"Getting tournaments for club: {club_name} ({club_code})")

            try:
                club_result = scrape_fft_club_tournaments(
                    club_code=club_code,
                    club_name=club_name,
                    start_date=start_date,
                    end_date=end_date,
                    page=0,  # Always get first page for each club
                )

                tournaments = (club_result or {}).get("tournaments") or []
                if tournaments:
                    # Apply the filters the club search does not take itself.
                    matching = [
                        tournament
                        for tournament in tournaments
                        if _tournament_matches_filters(
                            tournament,
                            categories,
                            levels,
                            ages,
                            tournament_types,
                            national_cup,
                        )
                    ]
                    all_tournaments.extend(matching)
                    logger.info(f"Added {len(matching)} tournaments from {club_name}")

                processed_clubs += 1

            except Exception as club_error:
                # One failing club must not abort the whole aggregation.
                logger.warning(
                    f"Error getting tournaments for club {club_name}: {club_error}"
                )
                continue

        logger.info(
            f"Processed {processed_clubs} clubs, found {len(all_tournaments)} total tournaments"
        )

        # Step 3: Remove duplicates (tournaments might appear in multiple club searches)
        unique_tournaments = []
        seen_tournament_ids = set()
        for tournament in all_tournaments:
            tournament_id = tournament.get("id")
            if tournament_id:
                if tournament_id in seen_tournament_ids:
                    continue
                seen_tournament_ids.add(tournament_id)
            # Entries without an id cannot be compared, so they are kept
            # rather than silently dropped.
            unique_tournaments.append(tournament)

        logger.info(
            f"After deduplication: {len(unique_tournaments)} unique tournaments"
        )

        # Step 4: Sort tournaments
        if sorting_option == "dateDebut+asc" or not sorting_option:
            unique_tournaments.sort(key=_tournament_start_date)
        elif sorting_option == "dateDebut+desc":
            unique_tournaments.sort(key=_tournament_start_date, reverse=True)
        # Note: Distance sorting would require additional calculation

        # Step 5: Handle pagination (a negative or missing page means page 0)
        page_index = max(int(page or 0), 0)
        start_idx = page_index * page_size
        page_tournaments = unique_tournaments[start_idx : start_idx + page_size]

        return {
            "tournaments": page_tournaments,
            "total_results": len(unique_tournaments),
            "current_count": len(page_tournaments),
            "method": "club_based_hybrid",
        }

    except Exception as e:
        logger.error(f"Error in hybrid club-based scraping: {e}")
        logger.error(f"Traceback: {traceback.format_exc()}")
        return None