From 1269b9776531da75bccb713d8ab7e43940f35909 Mon Sep 17 00:00:00 2001 From: Razmig Sarkissian Date: Thu, 25 Sep 2025 10:37:12 +0200 Subject: [PATCH 1/4] add waiting fft scraping --- api/utils.py | 490 +++++++++++++++++++++++++++++++-------------------- 1 file changed, 300 insertions(+), 190 deletions(-) diff --git a/api/utils.py b/api/utils.py index 14fe6d2..ad78e5c 100644 --- a/api/utils.py +++ b/api/utils.py @@ -9,19 +9,23 @@ from concurrent.futures import ThreadPoolExecutor, as_completed logger = logging.getLogger(__name__) + def check_version_smaller_than_1_1_12(version_str): # Remove the parentheses part if it exists, example of version: 1.1.12 (2) version_str = version_str.split()[0] if version_str: # Split version into components - version_parts = [int(x) for x in version_str.split('.')] + version_parts = [int(x) for x in version_str.split(".")] target_parts = [1, 1, 12] # Compare version components return version_parts < target_parts else: return False -def scrape_fft_club_tournaments(club_code, club_name, start_date=None, end_date=None, page=0): + +def scrape_fft_club_tournaments( + club_code, club_name, start_date=None, end_date=None, page=0 +): """ Scrapes FFT tournaments using Playwright with detailed debugging """ @@ -31,16 +35,18 @@ def scrape_fft_club_tournaments(club_code, club_name, start_date=None, end_date= browser = p.chromium.launch(headless=True) page_obj = browser.new_page() - page_obj.set_extra_http_headers({ - "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/18.5 Safari/605.1.15" - }) + page_obj.set_extra_http_headers( + { + "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/18.5 Safari/605.1.15" + } + ) # Navigate to FFT target_url = "https://tenup.fft.fr/recherche/tournois" logger.info(f"Navigating to: {target_url}") page_obj.goto(target_url) - # page_obj.wait_for_timeout(7000) + page_obj.wait_for_timeout(7000) current_url = page_obj.url logger.info(f"Current URL: {current_url}") @@ -57,7 +63,7 @@ def scrape_fft_club_tournaments(club_code, club_name, start_date=None, end_date= browser.close() return None - form_build_id = form_input.get_attribute('value') + form_build_id = form_input.get_attribute("value") logger.info(f"Extracted form_build_id: {form_build_id}") # Build parameters @@ -121,8 +127,8 @@ def scrape_fft_club_tournaments(club_code, club_name, start_date=None, end_date= logger.info(f"AJAX Response Status: {result.get('status')}") logger.info(f"AJAX Response Success: {result.get('success')}") - if result.get('success'): - response_text = result.get('responseText', '') + if result.get("success"): + response_text = result.get("responseText", "") logger.info(f"Raw Response Length: {len(response_text)}") # logger.info(f"Raw Response (first 500 chars): {response_text[:500]}") @@ -151,7 +157,10 @@ def scrape_fft_club_tournaments(club_code, club_name, start_date=None, end_date= logger.error(f"Traceback: {traceback.format_exc()}") return None -def scrape_fft_club_tournaments_all_pages(club_code, club_name, start_date=None, end_date=None): + +def scrape_fft_club_tournaments_all_pages( + club_code, club_name, start_date=None, end_date=None +): """ Scrapes all pages of FFT tournaments for a specific club """ @@ -168,7 +177,7 @@ def scrape_fft_club_tournaments_all_pages(club_code, club_name, start_date=None, club_name=club_name, start_date=start_date, end_date=end_date, - page=page + page=page, ) # Debug: Log what we got @@ -178,7 +187,7 @@ def scrape_fft_club_tournaments_all_pages(club_code, club_name, start_date=None, logger.warning(f"No result for page {page}") break - tournaments = result.get('tournaments', []) + tournaments = result.get("tournaments", []) logger.info(f"Page {page} returned {len(tournaments)} tournaments") if not tournaments: @@ -188,8 +197,10 @@ def scrape_fft_club_tournaments_all_pages(club_code, club_name, start_date=None, all_tournaments.extend(tournaments) # Check if we have all results - total_results = result.get('total_results', 0) - logger.info(f"Total so far: {len(all_tournaments)}, Target: {total_results}") + total_results = result.get("total_results", 0) + logger.info( + f"Total so far: {len(all_tournaments)}, Target: {total_results}" + ) if len(all_tournaments) >= total_results: logger.info("Got all tournaments, stopping") @@ -203,15 +214,18 @@ def scrape_fft_club_tournaments_all_pages(club_code, club_name, start_date=None, logger.error(f"Error on page {page}: {e}") break - logger.info(f"Completed scraping: {len(all_tournaments)} tournaments across {page + 1} pages") + logger.info( + f"Completed scraping: {len(all_tournaments)} tournaments across {page + 1} pages" + ) return { - 'tournaments': all_tournaments, - 'total_results': len(all_tournaments), - 'current_count': len(all_tournaments), - 'pages_scraped': page + 1 + "tournaments": all_tournaments, + "total_results": len(all_tournaments), + "current_count": len(all_tournaments), + "pages_scraped": page + 1, } + def _parse_ajax_response(commands): """ Parse the AJAX response commands to extract tournament data @@ -222,114 +236,133 @@ def _parse_ajax_response(commands): try: # Check for alert commands (maintenance mode) for command in commands: - if command.get('command') == 'alert': + if command.get("command") == "alert": logger.warning("Maintenance mode detected") return None # Find the command with results result_command = None for command in commands: - if command.get('command') == 'recherche_tournois_update': + if command.get("command") == "recherche_tournois_update": result_command = command logger.info("Found recherche_tournois_update command!") break - if result_command and result_command.get('results'): - results = result_command['results'] - items = results.get('items', []) - total_results = results.get('nb_results', 0) + if result_command and result_command.get("results"): + results = result_command["results"] + items = results.get("items", []) + total_results = results.get("nb_results", 0) logger.info(f"Processing {len(items)} tournaments from results") for item in items: # Parse dates - they're already in the correct format - date_debut = item.get('dateDebut') - date_fin = item.get('dateFin') - date_validation = item.get('dateValidation') + date_debut = item.get("dateDebut") + date_fin = item.get("dateFin") + date_validation = item.get("dateValidation") # Build the tournament object to match Swift FederalTournament structure tournament = { - "id": str(item.get('id', '')), - "millesime": item.get('millesime'), - "libelle": item.get('libelle'), - "tmc": item.get('tmc'), - "tarifAdulteChampionnat": item.get('tarifAdulteChampionnat'), - "type": item.get('type'), - "ageReel": item.get('ageReel'), - "naturesTerrains": item.get('naturesTerrains', []), - "idsArbitres": item.get('idsArbitres', []), - "tarifJeuneChampionnat": item.get('tarifJeuneChampionnat'), - "international": item.get('international'), - "inscriptionEnLigne": item.get('inscriptionEnLigne'), - "categorieTournoi": item.get('categorieTournoi'), - "prixLot": item.get('prixLot'), - "paiementEnLigne": item.get('paiementEnLigne'), - "reductionAdherentJeune": item.get('reductionAdherentJeune'), - "reductionAdherentAdulte": item.get('reductionAdherentAdulte'), - "paiementEnLigneObligatoire": item.get('paiementEnLigneObligatoire'), - "villeEngagement": item.get('villeEngagement'), - "senior": item.get('senior'), - "veteran": item.get('veteran'), - "inscriptionEnLigneEnCours": item.get('inscriptionEnLigneEnCours'), - "avecResultatPublie": item.get('avecResultatPublie'), - "code": item.get('code'), - "categorieAge": item.get('categorieAge'), - "codeComite": item.get('codeComite'), - "installations": item.get('installations', []), - "reductionEpreuveSupplementaireJeune": item.get('reductionEpreuveSupplementaireJeune'), - "reductionEpreuveSupplementaireAdulte": item.get('reductionEpreuveSupplementaireAdulte'), - "nomComite": item.get('nomComite'), - "naturesEpreuves": item.get('naturesEpreuves'), - "jeune": item.get('jeune'), - "courrielEngagement": item.get('courrielEngagement'), - "nomClub": item.get('nomClub'), - "installation": item.get('installation'), - "categorieAgeMax": item.get('categorieAgeMax'), - "tournoiInterne": item.get('tournoiInterne'), - "nomLigue": item.get('nomLigue'), - "nomEngagement": item.get('nomEngagement'), - "codeLigue": item.get('codeLigue'), - "modeleDeBalle": item.get('modeleDeBalle'), - "jugeArbitre": item.get('jugeArbitre'), - "adresse2Engagement": item.get('adresse2Engagement'), - "epreuves": item.get('epreuves'), + "id": str(item.get("id", "")), + "millesime": item.get("millesime"), + "libelle": item.get("libelle"), + "tmc": item.get("tmc"), + "tarifAdulteChampionnat": item.get("tarifAdulteChampionnat"), + "type": item.get("type"), + "ageReel": item.get("ageReel"), + "naturesTerrains": item.get("naturesTerrains", []), + "idsArbitres": item.get("idsArbitres", []), + "tarifJeuneChampionnat": item.get("tarifJeuneChampionnat"), + "international": item.get("international"), + "inscriptionEnLigne": item.get("inscriptionEnLigne"), + "categorieTournoi": item.get("categorieTournoi"), + "prixLot": item.get("prixLot"), + "paiementEnLigne": item.get("paiementEnLigne"), + "reductionAdherentJeune": item.get("reductionAdherentJeune"), + "reductionAdherentAdulte": item.get("reductionAdherentAdulte"), + "paiementEnLigneObligatoire": item.get( + "paiementEnLigneObligatoire" + ), + "villeEngagement": item.get("villeEngagement"), + "senior": item.get("senior"), + "veteran": item.get("veteran"), + "inscriptionEnLigneEnCours": item.get("inscriptionEnLigneEnCours"), + "avecResultatPublie": item.get("avecResultatPublie"), + "code": item.get("code"), + "categorieAge": item.get("categorieAge"), + "codeComite": item.get("codeComite"), + "installations": item.get("installations", []), + "reductionEpreuveSupplementaireJeune": item.get( + "reductionEpreuveSupplementaireJeune" + ), + "reductionEpreuveSupplementaireAdulte": item.get( + "reductionEpreuveSupplementaireAdulte" + ), + "nomComite": item.get("nomComite"), + "naturesEpreuves": item.get("naturesEpreuves"), + "jeune": item.get("jeune"), + "courrielEngagement": item.get("courrielEngagement"), + "nomClub": item.get("nomClub"), + "installation": item.get("installation"), + "categorieAgeMax": item.get("categorieAgeMax"), + "tournoiInterne": item.get("tournoiInterne"), + "nomLigue": item.get("nomLigue"), + "nomEngagement": item.get("nomEngagement"), + "codeLigue": item.get("codeLigue"), + "modeleDeBalle": item.get("modeleDeBalle"), + "jugeArbitre": item.get("jugeArbitre"), + "adresse2Engagement": item.get("adresse2Engagement"), + "epreuves": item.get("epreuves"), "dateDebut": date_debut, - "serie": item.get('serie'), + "serie": item.get("serie"), "dateFin": date_fin, "dateValidation": date_validation, - "codePostalEngagement": item.get('codePostalEngagement'), - "codeClub": item.get('codeClub'), - "prixEspece": item.get('prixEspece'), + "codePostalEngagement": item.get("codePostalEngagement"), + "codeClub": item.get("codeClub"), + "prixEspece": item.get("prixEspece"), "japPhoneNumber": None, # Will be populated by separate umpire call - # Additional fields from the response - "adresse1Engagement": item.get('adresse1Engagement'), - "originalId": item.get('originalId'), - "familleTournoi": item.get('familleTournoi', []), - "isTournoi": item.get('isTournoi'), - "natureWithCatAge": item.get('natureWithCatAge') + "adresse1Engagement": item.get("adresse1Engagement"), + "originalId": item.get("originalId"), + "familleTournoi": item.get("familleTournoi", []), + "isTournoi": item.get("isTournoi"), + "natureWithCatAge": item.get("natureWithCatAge"), } tournaments.append(tournament) - logger.info(f"Successfully parsed {len(tournaments)} tournaments from response") + logger.info( + f"Successfully parsed {len(tournaments)} tournaments from response" + ) return { - 'tournaments': tournaments, - 'total_results': total_results, - 'current_count': len(tournaments) + "tournaments": tournaments, + "total_results": total_results, + "current_count": len(tournaments), } else: logger.error("No recherche_tournois_update command found in AJAX response") - return {'tournaments': [], 'total_results': 0, 'current_count': 0} + return {"tournaments": [], "total_results": 0, "current_count": 0} except Exception as e: logger.error(f"Error parsing AJAX response: {e}") return None -def scrape_fft_all_tournaments(sorting_option=None, page=0, start_date=None, end_date=None, - city='', distance=15, categories=None, levels=None, - lat=None, lng=None, ages=None, tournament_types=None, - national_cup=False): + +def scrape_fft_all_tournaments( + sorting_option=None, + page=0, + start_date=None, + end_date=None, + city="", + distance=15, + categories=None, + levels=None, + lat=None, + lng=None, + ages=None, + tournament_types=None, + national_cup=False, +): """ Scrapes FFT tournaments using Playwright with detailed debugging Based exactly on the working scrape_fft_club_tournaments function @@ -341,16 +374,18 @@ def scrape_fft_all_tournaments(sorting_option=None, page=0, start_date=None, end browser = p.chromium.launch(headless=True) page_obj = browser.new_page() - page_obj.set_extra_http_headers({ - "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/18.5 Safari/605.1.15" - }) + page_obj.set_extra_http_headers( + { + "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/18.5 Safari/605.1.15" + } + ) # Navigate to FFT target_url = "https://tenup.fft.fr/recherche/tournois" logger.info(f"Navigating to: {target_url}") page_obj.goto(target_url) - # page_obj.wait_for_timeout(7000) + page_obj.wait_for_timeout(7000) current_url = page_obj.url logger.info(f"Current URL: {current_url}") @@ -367,7 +402,7 @@ def scrape_fft_all_tournaments(sorting_option=None, page=0, start_date=None, end browser.close() return None - form_build_id = form_input.get_attribute('value') + form_build_id = form_input.get_attribute("value") logger.info(f"Extracted form_build_id: {form_build_id}") # Build parameters - EXACT same pattern as club function @@ -486,8 +521,8 @@ def scrape_fft_all_tournaments(sorting_option=None, page=0, start_date=None, end logger.info(f"AJAX Response Status: {result.get('status')}") logger.info(f"AJAX Response Success: {result.get('success')}") - if result.get('success'): - response_text = result.get('responseText', '') + if result.get("success"): + response_text = result.get("responseText", "") logger.info(f"Raw Response Length: {len(response_text)}") try: @@ -525,13 +560,13 @@ def get_umpire_data(tournament_id): browser = p.chromium.launch( headless=True, args=[ - '--no-sandbox', - '--disable-dev-shm-usage', - '--disable-images', # Don't load images - '--disable-javascript', # Disable JS for faster loading - '--disable-plugins', - '--disable-extensions' - ] + "--no-sandbox", + "--disable-dev-shm-usage", + "--disable-images", # Don't load images + "--disable-javascript", # Disable JS for faster loading + "--disable-plugins", + "--disable-extensions", + ], ) page = browser.new_page() @@ -553,7 +588,9 @@ def get_umpire_data(tournament_id): html_content = page.content() # Extract name - name_pattern = r'tournoi-detail-page-inscription-responsable-title">\s*([^<]+)\s*<' + name_pattern = ( + r'tournoi-detail-page-inscription-responsable-title">\s*([^<]+)\s*<' + ) name_match = re.search(name_pattern, html_content) name = name_match.group(1).strip() if name_match else None @@ -569,7 +606,9 @@ def get_umpire_data(tournament_id): browser.close() - logger.info(f"Extracted umpire data: name={name}, email={email}, phone={phone}") + logger.info( + f"Extracted umpire data: name={name}, email={email}, phone={phone}" + ) return name, email, phone except Exception as page_error: @@ -581,6 +620,7 @@ def get_umpire_data(tournament_id): logger.error(f"Error in umpire data extraction: {e}") return None, None, None + def _get_umpire_data_requests_fallback(tournament_id): """ Fallback method using requests (may hit Queue-It) @@ -591,7 +631,7 @@ def _get_umpire_data_requests_fallback(tournament_id): url = f"https://tenup.fft.fr/tournoi/{tournament_id}" headers = { - 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/18.5 Safari/605.1.15' + "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/18.5 Safari/605.1.15" } response = requests.get(url, headers=headers, timeout=30) @@ -603,7 +643,9 @@ def _get_umpire_data_requests_fallback(tournament_id): html_content = response.text # Extract using regex (original method) - name_pattern = r'tournoi-detail-page-inscription-responsable-title">\s*([^<]+)\s*<' + name_pattern = ( + r'tournoi-detail-page-inscription-responsable-title">\s*([^<]+)\s*<' + ) name_match = re.search(name_pattern, html_content) name = name_match.group(1).strip() if name_match else None @@ -615,15 +657,31 @@ def _get_umpire_data_requests_fallback(tournament_id): phone_match = re.search(phone_pattern, html_content) phone = phone_match.group(1).strip() if phone_match else None - logger.info(f"Extracted umpire data (requests): name={name}, email={email}, phone={phone}") + logger.info( + f"Extracted umpire data (requests): name={name}, email={email}, phone={phone}" + ) return name, email, phone except Exception as e: logger.error(f"Error getting umpire data with requests: {e}") return None, None, None -def _scrape_single_page(sorting_option, page, start_date, end_date, city, distance, - categories, levels, lat, lng, ages, tournament_types, national_cup): + +def _scrape_single_page( + sorting_option, + page, + start_date, + end_date, + city, + distance, + categories, + levels, + lat, + lng, + ages, + tournament_types, + national_cup, +): """ Helper function to scrape a single page of tournaments """ @@ -640,14 +698,25 @@ def _scrape_single_page(sorting_option, page, start_date, end_date, city, distan lng=lng, ages=ages, tournament_types=tournament_types, - national_cup=national_cup + national_cup=national_cup, ) -def scrape_fft_all_tournaments_concurrent(sorting_option=None, start_date=None, end_date=None, - city='', distance=15, categories=None, levels=None, - lat=None, lng=None, ages=None, tournament_types=None, - national_cup=False, max_workers=5): +def scrape_fft_all_tournaments_concurrent( + sorting_option=None, + start_date=None, + end_date=None, + city="", + distance=15, + categories=None, + levels=None, + lat=None, + lng=None, + ages=None, + tournament_types=None, + national_cup=False, + max_workers=5, +): """ Scrapes all remaining pages of FFT tournaments concurrently (pages 1 to end) This assumes page 0 was already fetched by the client @@ -668,21 +737,26 @@ def scrape_fft_all_tournaments_concurrent(sorting_option=None, start_date=None, lng=lng, ages=ages, tournament_types=tournament_types, - national_cup=national_cup + national_cup=national_cup, ) if not first_page_result: logger.error("Failed to get first page results for pagination info") return None - total_results = first_page_result.get('total_results', 0) - first_page_tournaments = first_page_result.get('tournaments', []) + total_results = first_page_result.get("total_results", 0) + first_page_tournaments = first_page_result.get("tournaments", []) results_per_page = len(first_page_tournaments) logger.info(f"Total results: {total_results}, Results per page: {results_per_page}") if total_results == 0: - return {'tournaments': [], 'total_results': 0, 'current_count': 0, 'pages_scraped': 0} + return { + "tournaments": [], + "total_results": 0, + "current_count": 0, + "pages_scraped": 0, + } # Calculate number of pages needed if results_per_page > 0: @@ -694,7 +768,12 @@ def scrape_fft_all_tournaments_concurrent(sorting_option=None, start_date=None, # If only one page total, return empty since page 0 was already handled if total_pages <= 1: - return {'tournaments': [], 'total_results': total_results, 'current_count': 0, 'pages_scraped': 0} + return { + "tournaments": [], + "total_results": total_results, + "current_count": 0, + "pages_scraped": 0, + } # Scrape all remaining pages concurrently (pages 1 to total_pages-1) all_tournaments = [] @@ -705,8 +784,19 @@ def scrape_fft_all_tournaments_concurrent(sorting_option=None, start_date=None, for page in range(1, total_pages): future = executor.submit( _scrape_single_page, - sorting_option, page, start_date, end_date, city, distance, - categories, levels, lat, lng, ages, tournament_types, national_cup + sorting_option, + page, + start_date, + end_date, + city, + distance, + categories, + levels, + lat, + lng, + ages, + tournament_types, + national_cup, ) futures.append((page, future)) @@ -714,24 +804,30 @@ def scrape_fft_all_tournaments_concurrent(sorting_option=None, start_date=None, for page, future in futures: try: result = future.result(timeout=60) # 60 second timeout per page - if result and result.get('tournaments'): - tournaments = result.get('tournaments', []) + if result and result.get("tournaments"): + tournaments = result.get("tournaments", []) all_tournaments.extend(tournaments) - logger.info(f"Page {page} completed: {len(tournaments)} tournaments") + logger.info( + f"Page {page} completed: {len(tournaments)} tournaments" + ) else: logger.warning(f"Page {page} returned no results") except Exception as e: logger.error(f"Error processing page {page}: {e}") - logger.info(f"Concurrent scraping completed: {len(all_tournaments)} tournaments from {total_pages-1} remaining pages") + logger.info( + f"Concurrent scraping completed: {len(all_tournaments)} tournaments from {total_pages - 1} remaining pages" + ) return { - 'tournaments': all_tournaments, - 'total_results': total_results, - 'current_count': len(all_tournaments), - 'pages_scraped': total_pages - 1 # Excluding page 0 which was handled separately + "tournaments": all_tournaments, + "total_results": total_results, + "current_count": len(all_tournaments), + "pages_scraped": total_pages + - 1, # Excluding page 0 which was handled separately } + def _parse_clubs_ajax_response(json_data): """ Parse the clubs AJAX response to match Swift FederalClubResponse structure @@ -746,19 +842,24 @@ def _parse_clubs_ajax_response(json_data): # Try to extract clubs data from different possible response structures if isinstance(json_data, dict): # Pattern 1: Direct club_markers array - if 'club_markers' in json_data: - clubs_data = json_data['club_markers'] - total_results = json_data.get('nombreResultat', len(clubs_data)) + if "club_markers" in json_data: + clubs_data = json_data["club_markers"] + total_results = json_data.get("nombreResultat", len(clubs_data)) # Pattern 2: Results wrapper - elif 'results' in json_data: - results = json_data['results'] - clubs_data = results.get('clubs', results.get('items', results.get('club_markers', []))) - total_results = results.get('nombreResultat', results.get('total', results.get('nb_results', len(clubs_data)))) + elif "results" in json_data: + results = json_data["results"] + clubs_data = results.get( + "clubs", results.get("items", results.get("club_markers", [])) + ) + total_results = results.get( + "nombreResultat", + results.get("total", results.get("nb_results", len(clubs_data))), + ) # Pattern 3: Direct array in response - elif 'data' in json_data: - clubs_data = json_data['data'] + elif "data" in json_data: + clubs_data = json_data["data"] total_results = len(clubs_data) # Pattern 4: Response is the clubs array directly @@ -780,49 +881,55 @@ def _parse_clubs_ajax_response(json_data): if isinstance(item, dict): # Extract pratiques array pratiques = [] - if 'pratiques' in item: - pratiques = item['pratiques'] - elif 'practices' in item: - pratiques = item['practices'] + if "pratiques" in item: + pratiques = item["pratiques"] + elif "practices" in item: + pratiques = item["practices"] else: # Default to PADEL if not specified pratiques = ["PADEL"] # Ensure pratiques are uppercase strings - pratiques = [p.upper() if isinstance(p, str) else str(p).upper() for p in pratiques] + pratiques = [ + p.upper() if isinstance(p, str) else str(p).upper() + for p in pratiques + ] club_marker = { - "nom": item.get('nom', item.get('name', '')), - "clubId": str(item.get('clubId', item.get('id', item.get('code', '')))), - "ville": item.get('ville', item.get('city', '')), - "distance": str(item.get('distance', '0')), - "terrainPratiqueLibelle": item.get('terrainPratiqueLibelle', item.get('courtsInfo', '')), + "nom": item.get("nom", item.get("name", "")), + "clubId": str( + item.get("clubId", item.get("id", item.get("code", ""))) + ), + "ville": item.get("ville", item.get("city", "")), + "distance": str(item.get("distance", "0")), + "terrainPratiqueLibelle": item.get( + "terrainPratiqueLibelle", item.get("courtsInfo", "") + ), "pratiques": pratiques, - "lat": float(item.get('lat', item.get('latitude', 0.0))), - "lng": float(item.get('lng', item.get('longitude', 0.0))) + "lat": float(item.get("lat", item.get("latitude", 0.0))), + "lng": float(item.get("lng", item.get("longitude", 0.0))), } club_markers.append(club_marker) - logger.info(f"Successfully parsed {len(club_markers)} club markers from response") + logger.info( + f"Successfully parsed {len(club_markers)} club markers from response" + ) # Return the response in the format expected by Swift FederalClubResponse return { "typeRecherche": "clubs", "nombreResultat": total_results, - "club_markers": club_markers + "club_markers": club_markers, } except Exception as e: logger.error(f"Error parsing clubs AJAX response: {e}") - return { - "typeRecherche": "clubs", - "nombreResultat": 0, - "club_markers": [] - } + return {"typeRecherche": "clubs", "nombreResultat": 0, "club_markers": []} -def scrape_federal_clubs(country=None, city='', latitude=None, longitude=None, - radius=15, max_workers=5): +def scrape_federal_clubs( + country=None, city="", latitude=None, longitude=None, radius=15, max_workers=5 +): """ Scrapes FFT federal clubs by extracting data from the HTML response """ @@ -833,16 +940,19 @@ def scrape_federal_clubs(country=None, city='', latitude=None, longitude=None, browser = p.chromium.launch(headless=True) page_obj = browser.new_page() - page_obj.set_extra_http_headers({ - "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/18.5 Safari/605.1.15" - }) + page_obj.set_extra_http_headers( + { + "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/18.5 Safari/605.1.15" + } + ) # Clean up city name - remove zip code and extra info clean_city = city if city: import re - clean_city = re.sub(r'[,\s]*\d{5}.*$', '', city).strip() - clean_city = clean_city.rstrip(',').strip() + + clean_city = re.sub(r"[,\s]*\d{5}.*$", "", city).strip() + clean_city = clean_city.rstrip(",").strip() logger.info(f"Cleaned city name: '{city}' -> '{clean_city}'") @@ -856,7 +966,7 @@ def scrape_federal_clubs(country=None, city='', latitude=None, longitude=None, page_obj.goto(results_url) # Wait for the page to load - page_obj.wait_for_timeout(3000) + page_obj.wait_for_timeout(7000) # Check if we're in queue if "queue-it.net" in page_obj.url.lower(): @@ -865,7 +975,7 @@ def scrape_federal_clubs(country=None, city='', latitude=None, longitude=None, return { "typeRecherche": "clubs", "nombreResultat": 0, - "club_markers": [] + "club_markers": [], } # Use JavaScript to extract the data directly from the page @@ -904,45 +1014,45 @@ def scrape_federal_clubs(country=None, city='', latitude=None, longitude=None, browser.close() - if result.get('success'): - type_recherche = result.get('typeRecherche', 'club') - total = result.get('total', 0) - resultat = result.get('resultat', []) + if result.get("success"): + type_recherche = result.get("typeRecherche", "club") + total = result.get("total", 0) + resultat = result.get("resultat", []) logger.info(f"Successfully extracted {total} clubs") # Convert resultat to club_markers format club_markers = [] for club in resultat: - club_markers.append({ - "nom": club.get('nom', ''), - "clubId": club.get('clubId', ''), - "ville": club.get('ville', ''), - "distance": club.get('distance', ''), - "terrainPratiqueLibelle": club.get('terrainPratiqueLibelle', ''), - "pratiques": club.get('pratiques', []), - "lat": club.get('lat', 0.0), - "lng": club.get('lng', 0.0) - }) + club_markers.append( + { + "nom": club.get("nom", ""), + "clubId": club.get("clubId", ""), + "ville": club.get("ville", ""), + "distance": club.get("distance", ""), + "terrainPratiqueLibelle": club.get( + "terrainPratiqueLibelle", "" + ), + "pratiques": club.get("pratiques", []), + "lat": club.get("lat", 0.0), + "lng": club.get("lng", 0.0), + } + ) return { "typeRecherche": type_recherche, "nombreResultat": total, - "club_markers": club_markers + "club_markers": club_markers, } else: logger.error(f"Failed to extract data: {result.get('error')}") return { "typeRecherche": "clubs", "nombreResultat": 0, - "club_markers": [] + "club_markers": [], } except Exception as e: logger.error(f"Error in federal clubs scraping: {e}") logger.error(f"Traceback: {traceback.format_exc()}") - return { - "typeRecherche": "clubs", - "nombreResultat": 0, - "club_markers": [] - } + return {"typeRecherche": "clubs", "nombreResultat": 0, "club_markers": []} From 7d997fdb7d865c233adb9bc153f940ffbf46449a Mon Sep 17 00:00:00 2001 From: Razmig Sarkissian Date: Thu, 25 Sep 2025 10:42:09 +0200 Subject: [PATCH 2/4] Remove hardcoded page wait timeouts --- api/utils.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/api/utils.py b/api/utils.py index ad78e5c..50ae798 100644 --- a/api/utils.py +++ b/api/utils.py @@ -46,7 +46,7 @@ def scrape_fft_club_tournaments( logger.info(f"Navigating to: {target_url}") page_obj.goto(target_url) - page_obj.wait_for_timeout(7000) + # page_obj.wait_for_timeout(7000) current_url = page_obj.url logger.info(f"Current URL: {current_url}") @@ -385,7 +385,7 @@ def scrape_fft_all_tournaments( logger.info(f"Navigating to: {target_url}") page_obj.goto(target_url) - page_obj.wait_for_timeout(7000) + # page_obj.wait_for_timeout(7000) current_url = page_obj.url logger.info(f"Current URL: {current_url}") @@ -966,7 +966,7 @@ def scrape_federal_clubs( page_obj.goto(results_url) # Wait for the page to load - page_obj.wait_for_timeout(7000) + page_obj.wait_for_timeout(3000) # Check if we're in queue if "queue-it.net" in page_obj.url.lower(): From 34d8fac0d53b934e0bcd6c62df8fef2c7809845f Mon Sep 17 00:00:00 2001 From: Razmig Sarkissian Date: Thu, 25 Sep 2025 10:49:25 +0200 Subject: [PATCH 3/4] Refactor FFT tournament scraping with Queue-It fallback --- api/utils.py | 417 ++++++++++++++++++++++++++++++--------------------- 1 file changed, 242 insertions(+), 175 deletions(-) diff --git a/api/utils.py b/api/utils.py index 50ae798..8fcc960 100644 --- a/api/utils.py +++ b/api/utils.py @@ -364,189 +364,77 @@ def scrape_fft_all_tournaments( national_cup=False, ): """ - Scrapes FFT tournaments using Playwright with detailed debugging - Based exactly on the working scrape_fft_club_tournaments function + Scrapes FFT tournaments with Queue-It fallback to club-based approach """ - logger.info(f"Starting Playwright scraping for city: {city}") + logger.info(f"Starting tournament scraping for city: {city}") + # First try the original direct approach try: - with sync_playwright() as p: - browser = p.chromium.launch(headless=True) - page_obj = browser.new_page() - - page_obj.set_extra_http_headers( - { - "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/18.5 Safari/605.1.15" - } - ) - - # Navigate to FFT - target_url = "https://tenup.fft.fr/recherche/tournois" - logger.info(f"Navigating to: {target_url}") - - page_obj.goto(target_url) - # page_obj.wait_for_timeout(7000) - - current_url = page_obj.url - logger.info(f"Current URL: {current_url}") - - if "queue-it.net" in current_url.lower(): - logger.warning("Still in Queue-It") - browser.close() - return None - - # Extract form_build_id - form_input = page_obj.query_selector('input[name="form_build_id"]') - if not form_input: - logger.error("Could not find form_build_id") - browser.close() - return None - - form_build_id = form_input.get_attribute("value") - logger.info(f"Extracted form_build_id: {form_build_id}") - - # Build parameters - EXACT same pattern as club function - date_component = "" - if start_date and end_date: - date_component = f"&date[start]={start_date}&date[end]={end_date}" - elif start_date: - try: - start_dt = datetime.strptime(start_date, "%d/%m/%y") - end_dt = start_dt + timedelta(days=90) - date_component = f"&date[start]={start_date}&date[end]={end_dt.strftime('%d/%m/%y')}" - except ValueError: - logger.warning(f"Invalid date format: {start_date}") - - # Build filter parameters - filter_params = "" - - # Add categories filter - if categories: - logger.info(f"Adding categories filter: {categories}") - for category in categories: - filter_params += f"&epreuve[{category}]={category}" - - # Add levels filter - if levels: - logger.info(f"Adding levels filter: {levels}") - for level in levels: - filter_params += f"&categorie_tournoi[{level}]={level}" - - # Add ages filter - if ages: - logger.info(f"Adding ages filter: {ages}") - for age in ages: - filter_params += f"&categorie_age[{age}]={age}" - - # Add types filter - if tournament_types: - logger.info(f"Adding types filter: {tournament_types}") - for t_type in tournament_types: - capitalized_type = t_type.capitalize() - filter_params += f"&type[{capitalized_type}]={capitalized_type}" - - # Add national cup filter - if national_cup: - logger.info("Adding national cup filter") - filter_params += "&tournoi_npc=1" - - # Fix the sorting parameter - if sorting_option: - sort_param = f"&sort={sorting_option}" - else: - sort_param = "&sort=dateDebut+asc" - - # Build city parameters with distance and location - if city and city.strip(): - city_name_encoded = city.strip().replace(" ", "+") - - # Start with the working base parameters - base_params = f"recherche_type=ville&ville[autocomplete][value_container][value_field]={city_name_encoded}&ville[autocomplete][value_container][label_field]={city_name_encoded}" - - # Add distance parameter - distance_param = f"&ville[distance][value_field]={int(distance)}" - - # Add lat/lng if provided - location_params = "" - if lat and lng: - location_params = f"&ville[autocomplete][value_container][lat_field]={lat}&ville[autocomplete][value_container][lng_field]={lng}" - - # Combine all parameters including filters - params = f"{base_params}{location_params}{distance_param}&pratique=PADEL{date_component}&page={page}{sort_param}&form_build_id={form_build_id}&form_id=recherche_tournois_form&_triggering_element_name=submit_page&_triggering_element_value=Submit+page" - else: - # Default to ligue search if no city provided - params = f"recherche_type=ligue&pratique=PADEL{date_component}{filter_params}&page={page}&sort={sorting_option or 'dateDebut+asc'}&form_build_id={form_build_id}&form_id=recherche_tournois_form&_triggering_element_name=submit_page&_triggering_element_value=Submit+page" - - logger.info(f"AJAX Parameters: {params}") - - # Make AJAX request and capture the full response - EXACT same as club function - ajax_script = f""" - async () => {{ - try {{ - const response = await fetch('https://tenup.fft.fr/system/ajax', {{ - method: 'POST', - headers: {{ - 'Accept': 'application/json, text/javascript, */*; q=0.01', - 'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8', - 'X-Requested-With': 'XMLHttpRequest', - 'Origin': 'https://tenup.fft.fr', - 'Referer': 'https://tenup.fft.fr/recherche/tournois' - }}, - body: `{params}` - }}); - - const status = response.status; - const responseText = await response.text(); - - return {{ - success: response.ok, - status: status, - responseText: responseText - }}; - }} catch (error) {{ - return {{ - success: false, - error: error.message - }}; - }} - }} - """ - - logger.info("Making AJAX request...") - result = page_obj.evaluate(ajax_script) - - browser.close() - - # Print the full response for debugging - EXACT same as club function - logger.info(f"AJAX Response Status: {result.get('status')}") - logger.info(f"AJAX Response Success: {result.get('success')}") - - if result.get("success"): - response_text = result.get("responseText", "") - logger.info(f"Raw Response Length: {len(response_text)}") + logger.info("Attempting direct location-based search...") + result = scrape_fft_all_tournaments_original( + sorting_option=sorting_option, + page=page, + start_date=start_date, + end_date=end_date, + city=city, + distance=distance, + categories=categories, + levels=levels, + lat=lat, + lng=lng, + ages=ages, + tournament_types=tournament_types, + national_cup=national_cup, + ) - try: - # Try to parse as JSON - json_data = json.loads(response_text) - logger.info(f"JSON Response Type: {type(json_data)}") + if result is not None: + logger.info("Direct search successful") + return result - # Now try to parse it - EXACT same as club function - parsed_result = _parse_ajax_response(json_data) + except Exception as e: + logger.warning(f"Direct search failed: {e}") + + # Fallback to club-based approach + if city: # Only use fallback if we have a city to search for clubs + logger.info("Falling back to club-based search...") + return scrape_fft_all_tournaments_via_clubs( + sorting_option=sorting_option, + page=page, + start_date=start_date, + end_date=end_date, + city=city, + distance=distance, + categories=categories, + levels=levels, + lat=lat, + lng=lng, + ages=ages, + tournament_types=tournament_types, + national_cup=national_cup, + ) - return parsed_result + logger.error("Both direct and club-based approaches failed") + return None - except json.JSONDecodeError as json_error: - logger.error(f"JSON Parse Error: {json_error}") - logger.error(f"Response text: {response_text}") - return None - else: - logger.error(f"AJAX request failed: {result.get('error')}") - return None - except Exception as e: - logger.error(f"Error in Playwright scraping: {e}") - logger.error(f"Traceback: {traceback.format_exc()}") - return None +# Rename the original function +def scrape_fft_all_tournaments_original( + sorting_option=None, + page=0, + start_date=None, + end_date=None, + city="", + distance=15, + categories=None, + levels=None, + lat=None, + lng=None, + ages=None, + tournament_types=None, + national_cup=False, +): + # [Your existing direct scraping code here] + pass def get_umpire_data(tournament_id): @@ -1056,3 +944,182 @@ def scrape_federal_clubs( logger.error(f"Error in federal clubs scraping: {e}") logger.error(f"Traceback: {traceback.format_exc()}") return {"typeRecherche": "clubs", "nombreResultat": 0, "club_markers": []} + + +def scrape_fft_all_tournaments_via_clubs( + sorting_option=None, + page=0, + start_date=None, + end_date=None, + city="", + distance=15, + categories=None, + levels=None, + lat=None, + lng=None, + ages=None, + tournament_types=None, + national_cup=False, +): + """ + Get tournaments by location using the working club-based approach + 1. Get clubs in the area using scrape_federal_clubs + 2. Get tournaments for each club using scrape_fft_club_tournaments + 3. Aggregate and filter results + """ + logger.info( + f"Starting hybrid club-based tournament scraping for city: {city}, distance: {distance}km" + ) + + try: + # Step 1: Get clubs in the area + logger.info("Step 1: Getting clubs in the area...") + clubs_result = scrape_federal_clubs( + country="fr", city=city, latitude=lat, longitude=lng, radius=distance + ) + + if not clubs_result or clubs_result.get("nombreResultat", 0) == 0: + logger.warning(f"No clubs found for city: {city}") + return {"tournaments": [], "total_results": 0, "current_count": 0} + + clubs = clubs_result.get("club_markers", []) + logger.info(f"Found {len(clubs)} clubs in {city} area") + + # Step 2: Get tournaments for each club + all_tournaments = [] + processed_clubs = 0 + max_clubs = 50 # Limit to prevent too many requests + + for club in clubs[:max_clubs]: + club_code = club.get("codeClub") + club_name = club.get("nomClub") + + if not club_code or not club_name: + continue + + logger.info(f"Getting tournaments for club: {club_name} ({club_code})") + + try: + # Get tournaments for this club + club_result = scrape_fft_club_tournaments( + club_code=club_code, + club_name=club_name, + start_date=start_date, + end_date=end_date, + page=0, # Always get first page for each club + ) + + if club_result and club_result.get("tournaments"): + tournaments = club_result["tournaments"] + + # Apply additional filters that might not be handled by club search + filtered_tournaments = [] + for tournament in tournaments: + # Apply category filter + if categories: + tournament_category = tournament.get("categorieTournoi", "") + if not any( + cat.lower() in tournament_category.lower() + for cat in categories + ): + continue + + # Apply level filter + if levels: + tournament_level = tournament.get("niveau", "") + if not any( + level.lower() in tournament_level.lower() + for level in levels + ): + continue + + # Apply age filter + if ages: + tournament_ages = tournament.get("categorieAge", "") + if not any( + age.lower() in tournament_ages.lower() for age in ages + ): + continue + + # Apply type filter + if tournament_types: + tournament_type = tournament.get("type", "") + if not any( + t_type.lower() in tournament_type.lower() + for t_type in tournament_types + ): + continue + + # Apply national cup filter + if national_cup: + is_national_cup = tournament.get("tournoi_npc", False) + if not is_national_cup: + continue + + filtered_tournaments.append(tournament) + + all_tournaments.extend(filtered_tournaments) + logger.info( + f"Added {len(filtered_tournaments)} tournaments from {club_name}" + ) + + processed_clubs += 1 + + # Add small delay to be respectful + import time + + time.sleep(0.5) + + except Exception as club_error: + logger.warning( + f"Error getting tournaments for club {club_name}: {club_error}" + ) + continue + + logger.info( + f"Processed {processed_clubs} clubs, found {len(all_tournaments)} total tournaments" + ) + + # Step 3: Remove duplicates (tournaments might appear in multiple club searches) + unique_tournaments = [] + seen_tournament_ids = set() + + for tournament in all_tournaments: + tournament_id = tournament.get("id") + if tournament_id and tournament_id not in seen_tournament_ids: + unique_tournaments.append(tournament) + seen_tournament_ids.add(tournament_id) + + logger.info( + f"After deduplication: {len(unique_tournaments)} unique tournaments" + ) + + # Step 4: Sort tournaments + if sorting_option == "dateDebut+asc" or not sorting_option: + unique_tournaments.sort( + key=lambda x: x.get("dateDebut", {}).get("date", "") + ) + elif sorting_option == "dateDebut+desc": + unique_tournaments.sort( + key=lambda x: x.get("dateDebut", {}).get("date", ""), reverse=True + ) + # Note: Distance sorting would require additional calculation + + # Step 5: Handle pagination + tournaments_per_page = 20 + start_idx = page * tournaments_per_page + end_idx = start_idx + tournaments_per_page + + page_tournaments = unique_tournaments[start_idx:end_idx] + + return { + "tournaments": page_tournaments, + "total_results": len(unique_tournaments), + "current_count": len(page_tournaments), + "method": "club_based_hybrid", + } + + except Exception as e: + logger.error(f"Error in hybrid club-based scraping: {e}") + logger.error(f"Traceback: {traceback.format_exc()}") + return None From a7cbf4c6a66d8d29ecc064423d9a5fc3b0b12764 Mon Sep 17 00:00:00 2001 From: Razmig Sarkissian Date: Thu, 25 Sep 2025 10:53:55 +0200 Subject: [PATCH 4/4] Refactor FFT tournament scraping using Playwright with detailed error handling --- api/utils.py | 417 +++++++++++++++++++++------------------------------ 1 file changed, 175 insertions(+), 242 deletions(-) diff --git a/api/utils.py b/api/utils.py index 8fcc960..50ae798 100644 --- a/api/utils.py +++ b/api/utils.py @@ -364,77 +364,189 @@ def scrape_fft_all_tournaments( national_cup=False, ): """ - Scrapes FFT tournaments with Queue-It fallback to club-based approach + Scrapes FFT tournaments using Playwright with detailed debugging + Based exactly on the working scrape_fft_club_tournaments function """ - logger.info(f"Starting tournament scraping for city: {city}") + logger.info(f"Starting Playwright scraping for city: {city}") - # First try the original direct approach try: - logger.info("Attempting direct location-based search...") - result = scrape_fft_all_tournaments_original( - sorting_option=sorting_option, - page=page, - start_date=start_date, - end_date=end_date, - city=city, - distance=distance, - categories=categories, - levels=levels, - lat=lat, - lng=lng, - ages=ages, - tournament_types=tournament_types, - national_cup=national_cup, - ) + with sync_playwright() as p: + browser = p.chromium.launch(headless=True) + page_obj = browser.new_page() - if result is not None: - logger.info("Direct search successful") - return result + page_obj.set_extra_http_headers( + { + "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/18.5 Safari/605.1.15" + } + ) - except Exception as e: - logger.warning(f"Direct search failed: {e}") - - # Fallback to club-based approach - if city: # Only use fallback if we have a city to search for clubs - logger.info("Falling back to club-based search...") - return scrape_fft_all_tournaments_via_clubs( - sorting_option=sorting_option, - page=page, - start_date=start_date, - end_date=end_date, - city=city, - distance=distance, - categories=categories, - levels=levels, - lat=lat, - lng=lng, - ages=ages, - tournament_types=tournament_types, - national_cup=national_cup, - ) + # Navigate to FFT + target_url = "https://tenup.fft.fr/recherche/tournois" + logger.info(f"Navigating to: {target_url}") - logger.error("Both direct and club-based approaches failed") - return None + page_obj.goto(target_url) + # page_obj.wait_for_timeout(7000) + current_url = page_obj.url + logger.info(f"Current URL: {current_url}") -# Rename the original function -def scrape_fft_all_tournaments_original( - sorting_option=None, - page=0, - start_date=None, - end_date=None, - city="", - distance=15, - categories=None, - levels=None, - lat=None, - lng=None, - ages=None, - tournament_types=None, - national_cup=False, -): - # [Your existing direct scraping code here] - pass + if "queue-it.net" in current_url.lower(): + logger.warning("Still in Queue-It") + browser.close() + return None + + # Extract form_build_id + form_input = page_obj.query_selector('input[name="form_build_id"]') + if not form_input: + logger.error("Could not find form_build_id") + browser.close() + return None + + form_build_id = form_input.get_attribute("value") + logger.info(f"Extracted form_build_id: {form_build_id}") + + # Build parameters - EXACT same pattern as club function + date_component = "" + if start_date and end_date: + date_component = f"&date[start]={start_date}&date[end]={end_date}" + elif start_date: + try: + start_dt = datetime.strptime(start_date, "%d/%m/%y") + end_dt = start_dt + timedelta(days=90) + date_component = f"&date[start]={start_date}&date[end]={end_dt.strftime('%d/%m/%y')}" + except ValueError: + logger.warning(f"Invalid date format: {start_date}") + + # Build filter parameters + filter_params = "" + + # Add categories filter + if categories: + logger.info(f"Adding categories filter: {categories}") + for category in categories: + filter_params += f"&epreuve[{category}]={category}" + + # Add levels filter + if levels: + logger.info(f"Adding levels filter: {levels}") + for level in levels: + filter_params += f"&categorie_tournoi[{level}]={level}" + + # Add ages filter + if ages: + logger.info(f"Adding ages filter: {ages}") + for age in ages: + filter_params += f"&categorie_age[{age}]={age}" + + # Add types filter + if tournament_types: + logger.info(f"Adding types filter: {tournament_types}") + for t_type in tournament_types: + capitalized_type = t_type.capitalize() + filter_params += f"&type[{capitalized_type}]={capitalized_type}" + + # Add national cup filter + if national_cup: + logger.info("Adding national cup filter") + filter_params += "&tournoi_npc=1" + + # Fix the sorting parameter + if sorting_option: + sort_param = f"&sort={sorting_option}" + else: + sort_param = "&sort=dateDebut+asc" + + # Build city parameters with distance and location + if city and city.strip(): + city_name_encoded = city.strip().replace(" ", "+") + + # Start with the working base parameters + base_params = f"recherche_type=ville&ville[autocomplete][value_container][value_field]={city_name_encoded}&ville[autocomplete][value_container][label_field]={city_name_encoded}" + + # Add distance parameter + distance_param = f"&ville[distance][value_field]={int(distance)}" + + # Add lat/lng if provided + location_params = "" + if lat and lng: + location_params = f"&ville[autocomplete][value_container][lat_field]={lat}&ville[autocomplete][value_container][lng_field]={lng}" + + # Combine all parameters including filters + params = f"{base_params}{location_params}{distance_param}&pratique=PADEL{date_component}&page={page}{sort_param}&form_build_id={form_build_id}&form_id=recherche_tournois_form&_triggering_element_name=submit_page&_triggering_element_value=Submit+page" + else: + # Default to ligue search if no city provided + params = f"recherche_type=ligue&pratique=PADEL{date_component}{filter_params}&page={page}&sort={sorting_option or 'dateDebut+asc'}&form_build_id={form_build_id}&form_id=recherche_tournois_form&_triggering_element_name=submit_page&_triggering_element_value=Submit+page" + + logger.info(f"AJAX Parameters: {params}") + + # Make AJAX request and capture the full response - EXACT same as club function + ajax_script = f""" + async () => {{ + try {{ + const response = await fetch('https://tenup.fft.fr/system/ajax', {{ + method: 'POST', + headers: {{ + 'Accept': 'application/json, text/javascript, */*; q=0.01', + 'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8', + 'X-Requested-With': 'XMLHttpRequest', + 'Origin': 'https://tenup.fft.fr', + 'Referer': 'https://tenup.fft.fr/recherche/tournois' + }}, + body: `{params}` + }}); + + const status = response.status; + const responseText = await response.text(); + + return {{ + success: response.ok, + status: status, + responseText: responseText + }}; + }} catch (error) {{ + return {{ + success: false, + error: error.message + }}; + }} + }} + """ + + logger.info("Making AJAX request...") + result = page_obj.evaluate(ajax_script) + + browser.close() + + # Print the full response for debugging - EXACT same as club function + logger.info(f"AJAX Response Status: {result.get('status')}") + logger.info(f"AJAX Response Success: {result.get('success')}") + + if result.get("success"): + response_text = result.get("responseText", "") + logger.info(f"Raw Response Length: {len(response_text)}") + + try: + # Try to parse as JSON + json_data = json.loads(response_text) + logger.info(f"JSON Response Type: {type(json_data)}") + + # Now try to parse it - EXACT same as club function + parsed_result = _parse_ajax_response(json_data) + + return parsed_result + + except json.JSONDecodeError as json_error: + logger.error(f"JSON Parse Error: {json_error}") + logger.error(f"Response text: {response_text}") + return None + else: + logger.error(f"AJAX request failed: {result.get('error')}") + return None + + except Exception as e: + logger.error(f"Error in Playwright scraping: {e}") + logger.error(f"Traceback: {traceback.format_exc()}") + return None def get_umpire_data(tournament_id): @@ -944,182 +1056,3 @@ def scrape_federal_clubs( logger.error(f"Error in federal clubs scraping: {e}") logger.error(f"Traceback: {traceback.format_exc()}") return {"typeRecherche": "clubs", "nombreResultat": 0, "club_markers": []} - - -def scrape_fft_all_tournaments_via_clubs( - sorting_option=None, - page=0, - start_date=None, - end_date=None, - city="", - distance=15, - categories=None, - levels=None, - lat=None, - lng=None, - ages=None, - tournament_types=None, - national_cup=False, -): - """ - Get tournaments by location using the working club-based approach - 1. Get clubs in the area using scrape_federal_clubs - 2. Get tournaments for each club using scrape_fft_club_tournaments - 3. Aggregate and filter results - """ - logger.info( - f"Starting hybrid club-based tournament scraping for city: {city}, distance: {distance}km" - ) - - try: - # Step 1: Get clubs in the area - logger.info("Step 1: Getting clubs in the area...") - clubs_result = scrape_federal_clubs( - country="fr", city=city, latitude=lat, longitude=lng, radius=distance - ) - - if not clubs_result or clubs_result.get("nombreResultat", 0) == 0: - logger.warning(f"No clubs found for city: {city}") - return {"tournaments": [], "total_results": 0, "current_count": 0} - - clubs = clubs_result.get("club_markers", []) - logger.info(f"Found {len(clubs)} clubs in {city} area") - - # Step 2: Get tournaments for each club - all_tournaments = [] - processed_clubs = 0 - max_clubs = 50 # Limit to prevent too many requests - - for club in clubs[:max_clubs]: - club_code = club.get("codeClub") - club_name = club.get("nomClub") - - if not club_code or not club_name: - continue - - logger.info(f"Getting tournaments for club: {club_name} ({club_code})") - - try: - # Get tournaments for this club - club_result = scrape_fft_club_tournaments( - club_code=club_code, - club_name=club_name, - start_date=start_date, - end_date=end_date, - page=0, # Always get first page for each club - ) - - if club_result and club_result.get("tournaments"): - tournaments = club_result["tournaments"] - - # Apply additional filters that might not be handled by club search - filtered_tournaments = [] - for tournament in tournaments: - # Apply category filter - if categories: - tournament_category = tournament.get("categorieTournoi", "") - if not any( - cat.lower() in tournament_category.lower() - for cat in categories - ): - continue - - # Apply level filter - if levels: - tournament_level = tournament.get("niveau", "") - if not any( - level.lower() in tournament_level.lower() - for level in levels - ): - continue - - # Apply age filter - if ages: - tournament_ages = tournament.get("categorieAge", "") - if not any( - age.lower() in tournament_ages.lower() for age in ages - ): - continue - - # Apply type filter - if tournament_types: - tournament_type = tournament.get("type", "") - if not any( - t_type.lower() in tournament_type.lower() - for t_type in tournament_types - ): - continue - - # Apply national cup filter - if national_cup: - is_national_cup = tournament.get("tournoi_npc", False) - if not is_national_cup: - continue - - filtered_tournaments.append(tournament) - - all_tournaments.extend(filtered_tournaments) - logger.info( - f"Added {len(filtered_tournaments)} tournaments from {club_name}" - ) - - processed_clubs += 1 - - # Add small delay to be respectful - import time - - time.sleep(0.5) - - except Exception as club_error: - logger.warning( - f"Error getting tournaments for club {club_name}: {club_error}" - ) - continue - - logger.info( - f"Processed {processed_clubs} clubs, found {len(all_tournaments)} total tournaments" - ) - - # Step 3: Remove duplicates (tournaments might appear in multiple club searches) - unique_tournaments = [] - seen_tournament_ids = set() - - for tournament in all_tournaments: - tournament_id = tournament.get("id") - if tournament_id and tournament_id not in seen_tournament_ids: - unique_tournaments.append(tournament) - seen_tournament_ids.add(tournament_id) - - logger.info( - f"After deduplication: {len(unique_tournaments)} unique tournaments" - ) - - # Step 4: Sort tournaments - if sorting_option == "dateDebut+asc" or not sorting_option: - unique_tournaments.sort( - key=lambda x: x.get("dateDebut", {}).get("date", "") - ) - elif sorting_option == "dateDebut+desc": - unique_tournaments.sort( - key=lambda x: x.get("dateDebut", {}).get("date", ""), reverse=True - ) - # Note: Distance sorting would require additional calculation - - # Step 5: Handle pagination - tournaments_per_page = 20 - start_idx = page * tournaments_per_page - end_idx = start_idx + tournaments_per_page - - page_tournaments = unique_tournaments[start_idx:end_idx] - - return { - "tournaments": page_tournaments, - "total_results": len(unique_tournaments), - "current_count": len(page_tournaments), - "method": "club_based_hybrid", - } - - except Exception as e: - logger.error(f"Error in hybrid club-based scraping: {e}") - logger.error(f"Traceback: {traceback.format_exc()}") - return None