From 1269b9776531da75bccb713d8ab7e43940f35909 Mon Sep 17 00:00:00 2001 From: Razmig Sarkissian Date: Thu, 25 Sep 2025 10:37:12 +0200 Subject: [PATCH] add waiting fft scraping --- api/utils.py | 490 +++++++++++++++++++++++++++++++-------------------- 1 file changed, 300 insertions(+), 190 deletions(-) diff --git a/api/utils.py b/api/utils.py index 14fe6d2..ad78e5c 100644 --- a/api/utils.py +++ b/api/utils.py @@ -9,19 +9,23 @@ from concurrent.futures import ThreadPoolExecutor, as_completed logger = logging.getLogger(__name__) + def check_version_smaller_than_1_1_12(version_str): # Remove the parentheses part if it exists, example of version: 1.1.12 (2) version_str = version_str.split()[0] if version_str: # Split version into components - version_parts = [int(x) for x in version_str.split('.')] + version_parts = [int(x) for x in version_str.split(".")] target_parts = [1, 1, 12] # Compare version components return version_parts < target_parts else: return False -def scrape_fft_club_tournaments(club_code, club_name, start_date=None, end_date=None, page=0): + +def scrape_fft_club_tournaments( + club_code, club_name, start_date=None, end_date=None, page=0 +): """ Scrapes FFT tournaments using Playwright with detailed debugging """ @@ -31,16 +35,18 @@ def scrape_fft_club_tournaments(club_code, club_name, start_date=None, end_date= browser = p.chromium.launch(headless=True) page_obj = browser.new_page() - page_obj.set_extra_http_headers({ - "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/18.5 Safari/605.1.15" - }) + page_obj.set_extra_http_headers( + { + "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/18.5 Safari/605.1.15" + } + ) # Navigate to FFT target_url = "https://tenup.fft.fr/recherche/tournois" logger.info(f"Navigating to: {target_url}") page_obj.goto(target_url) - # page_obj.wait_for_timeout(7000) + page_obj.wait_for_timeout(7000) current_url = page_obj.url logger.info(f"Current URL: {current_url}") @@ -57,7 +63,7 @@ def scrape_fft_club_tournaments(club_code, club_name, start_date=None, end_date= browser.close() return None - form_build_id = form_input.get_attribute('value') + form_build_id = form_input.get_attribute("value") logger.info(f"Extracted form_build_id: {form_build_id}") # Build parameters @@ -121,8 +127,8 @@ def scrape_fft_club_tournaments(club_code, club_name, start_date=None, end_date= logger.info(f"AJAX Response Status: {result.get('status')}") logger.info(f"AJAX Response Success: {result.get('success')}") - if result.get('success'): - response_text = result.get('responseText', '') + if result.get("success"): + response_text = result.get("responseText", "") logger.info(f"Raw Response Length: {len(response_text)}") # logger.info(f"Raw Response (first 500 chars): {response_text[:500]}") @@ -151,7 +157,10 @@ def scrape_fft_club_tournaments(club_code, club_name, start_date=None, end_date= logger.error(f"Traceback: {traceback.format_exc()}") return None -def scrape_fft_club_tournaments_all_pages(club_code, club_name, start_date=None, end_date=None): + +def scrape_fft_club_tournaments_all_pages( + club_code, club_name, start_date=None, end_date=None +): """ Scrapes all pages of FFT tournaments for a specific club """ @@ -168,7 +177,7 @@ def scrape_fft_club_tournaments_all_pages(club_code, club_name, start_date=None, club_name=club_name, start_date=start_date, end_date=end_date, - page=page + page=page, ) # Debug: Log what we got @@ -178,7 +187,7 @@ def scrape_fft_club_tournaments_all_pages(club_code, club_name, start_date=None, logger.warning(f"No result for page {page}") break - tournaments = result.get('tournaments', []) + tournaments = result.get("tournaments", []) logger.info(f"Page {page} returned {len(tournaments)} tournaments") if not tournaments: @@ -188,8 +197,10 @@ def scrape_fft_club_tournaments_all_pages(club_code, club_name, start_date=None, all_tournaments.extend(tournaments) # Check if we have all results - total_results = result.get('total_results', 0) - logger.info(f"Total so far: {len(all_tournaments)}, Target: {total_results}") + total_results = result.get("total_results", 0) + logger.info( + f"Total so far: {len(all_tournaments)}, Target: {total_results}" + ) if len(all_tournaments) >= total_results: logger.info("Got all tournaments, stopping") @@ -203,15 +214,18 @@ def scrape_fft_club_tournaments_all_pages(club_code, club_name, start_date=None, logger.error(f"Error on page {page}: {e}") break - logger.info(f"Completed scraping: {len(all_tournaments)} tournaments across {page + 1} pages") + logger.info( + f"Completed scraping: {len(all_tournaments)} tournaments across {page + 1} pages" + ) return { - 'tournaments': all_tournaments, - 'total_results': len(all_tournaments), - 'current_count': len(all_tournaments), - 'pages_scraped': page + 1 + "tournaments": all_tournaments, + "total_results": len(all_tournaments), + "current_count": len(all_tournaments), + "pages_scraped": page + 1, } + def _parse_ajax_response(commands): """ Parse the AJAX response commands to extract tournament data @@ -222,114 +236,133 @@ def _parse_ajax_response(commands): try: # Check for alert commands (maintenance mode) for command in commands: - if command.get('command') == 'alert': + if command.get("command") == "alert": logger.warning("Maintenance mode detected") return None # Find the command with results result_command = None for command in commands: - if command.get('command') == 'recherche_tournois_update': + if command.get("command") == "recherche_tournois_update": result_command = command logger.info("Found recherche_tournois_update command!") break - if result_command and result_command.get('results'): - results = result_command['results'] - items = results.get('items', []) - total_results = results.get('nb_results', 0) + if result_command and result_command.get("results"): + results = result_command["results"] + items = results.get("items", []) + total_results = results.get("nb_results", 0) logger.info(f"Processing {len(items)} tournaments from results") for item in items: # Parse dates - they're already in the correct format - date_debut = item.get('dateDebut') - date_fin = item.get('dateFin') - date_validation = item.get('dateValidation') + date_debut = item.get("dateDebut") + date_fin = item.get("dateFin") + date_validation = item.get("dateValidation") # Build the tournament object to match Swift FederalTournament structure tournament = { - "id": str(item.get('id', '')), - "millesime": item.get('millesime'), - "libelle": item.get('libelle'), - "tmc": item.get('tmc'), - "tarifAdulteChampionnat": item.get('tarifAdulteChampionnat'), - "type": item.get('type'), - "ageReel": item.get('ageReel'), - "naturesTerrains": item.get('naturesTerrains', []), - "idsArbitres": item.get('idsArbitres', []), - "tarifJeuneChampionnat": item.get('tarifJeuneChampionnat'), - "international": item.get('international'), - "inscriptionEnLigne": item.get('inscriptionEnLigne'), - "categorieTournoi": item.get('categorieTournoi'), - "prixLot": item.get('prixLot'), - "paiementEnLigne": item.get('paiementEnLigne'), - "reductionAdherentJeune": item.get('reductionAdherentJeune'), - "reductionAdherentAdulte": item.get('reductionAdherentAdulte'), - "paiementEnLigneObligatoire": item.get('paiementEnLigneObligatoire'), - "villeEngagement": item.get('villeEngagement'), - "senior": item.get('senior'), - "veteran": item.get('veteran'), - "inscriptionEnLigneEnCours": item.get('inscriptionEnLigneEnCours'), - "avecResultatPublie": item.get('avecResultatPublie'), - "code": item.get('code'), - "categorieAge": item.get('categorieAge'), - "codeComite": item.get('codeComite'), - "installations": item.get('installations', []), - "reductionEpreuveSupplementaireJeune": item.get('reductionEpreuveSupplementaireJeune'), - "reductionEpreuveSupplementaireAdulte": item.get('reductionEpreuveSupplementaireAdulte'), - "nomComite": item.get('nomComite'), - "naturesEpreuves": item.get('naturesEpreuves'), - "jeune": item.get('jeune'), - "courrielEngagement": item.get('courrielEngagement'), - "nomClub": item.get('nomClub'), - "installation": item.get('installation'), - "categorieAgeMax": item.get('categorieAgeMax'), - "tournoiInterne": item.get('tournoiInterne'), - "nomLigue": item.get('nomLigue'), - "nomEngagement": item.get('nomEngagement'), - "codeLigue": item.get('codeLigue'), - "modeleDeBalle": item.get('modeleDeBalle'), - "jugeArbitre": item.get('jugeArbitre'), - "adresse2Engagement": item.get('adresse2Engagement'), - "epreuves": item.get('epreuves'), + "id": str(item.get("id", "")), + "millesime": item.get("millesime"), + "libelle": item.get("libelle"), + "tmc": item.get("tmc"), + "tarifAdulteChampionnat": item.get("tarifAdulteChampionnat"), + "type": item.get("type"), + "ageReel": item.get("ageReel"), + "naturesTerrains": item.get("naturesTerrains", []), + "idsArbitres": item.get("idsArbitres", []), + "tarifJeuneChampionnat": item.get("tarifJeuneChampionnat"), + "international": item.get("international"), + "inscriptionEnLigne": item.get("inscriptionEnLigne"), + "categorieTournoi": item.get("categorieTournoi"), + "prixLot": item.get("prixLot"), + "paiementEnLigne": item.get("paiementEnLigne"), + "reductionAdherentJeune": item.get("reductionAdherentJeune"), + "reductionAdherentAdulte": item.get("reductionAdherentAdulte"), + "paiementEnLigneObligatoire": item.get( + "paiementEnLigneObligatoire" + ), + "villeEngagement": item.get("villeEngagement"), + "senior": item.get("senior"), + "veteran": item.get("veteran"), + "inscriptionEnLigneEnCours": item.get("inscriptionEnLigneEnCours"), + "avecResultatPublie": item.get("avecResultatPublie"), + "code": item.get("code"), + "categorieAge": item.get("categorieAge"), + "codeComite": item.get("codeComite"), + "installations": item.get("installations", []), + "reductionEpreuveSupplementaireJeune": item.get( + "reductionEpreuveSupplementaireJeune" + ), + "reductionEpreuveSupplementaireAdulte": item.get( + "reductionEpreuveSupplementaireAdulte" + ), + "nomComite": item.get("nomComite"), + "naturesEpreuves": item.get("naturesEpreuves"), + "jeune": item.get("jeune"), + "courrielEngagement": item.get("courrielEngagement"), + "nomClub": item.get("nomClub"), + "installation": item.get("installation"), + "categorieAgeMax": item.get("categorieAgeMax"), + "tournoiInterne": item.get("tournoiInterne"), + "nomLigue": item.get("nomLigue"), + "nomEngagement": item.get("nomEngagement"), + "codeLigue": item.get("codeLigue"), + "modeleDeBalle": item.get("modeleDeBalle"), + "jugeArbitre": item.get("jugeArbitre"), + "adresse2Engagement": item.get("adresse2Engagement"), + "epreuves": item.get("epreuves"), "dateDebut": date_debut, - "serie": item.get('serie'), + "serie": item.get("serie"), "dateFin": date_fin, "dateValidation": date_validation, - "codePostalEngagement": item.get('codePostalEngagement'), - "codeClub": item.get('codeClub'), - "prixEspece": item.get('prixEspece'), + "codePostalEngagement": item.get("codePostalEngagement"), + "codeClub": item.get("codeClub"), + "prixEspece": item.get("prixEspece"), "japPhoneNumber": None, # Will be populated by separate umpire call - # Additional fields from the response - "adresse1Engagement": item.get('adresse1Engagement'), - "originalId": item.get('originalId'), - "familleTournoi": item.get('familleTournoi', []), - "isTournoi": item.get('isTournoi'), - "natureWithCatAge": item.get('natureWithCatAge') + "adresse1Engagement": item.get("adresse1Engagement"), + "originalId": item.get("originalId"), + "familleTournoi": item.get("familleTournoi", []), + "isTournoi": item.get("isTournoi"), + "natureWithCatAge": item.get("natureWithCatAge"), } tournaments.append(tournament) - logger.info(f"Successfully parsed {len(tournaments)} tournaments from response") + logger.info( + f"Successfully parsed {len(tournaments)} tournaments from response" + ) return { - 'tournaments': tournaments, - 'total_results': total_results, - 'current_count': len(tournaments) + "tournaments": tournaments, + "total_results": total_results, + "current_count": len(tournaments), } else: logger.error("No recherche_tournois_update command found in AJAX response") - return {'tournaments': [], 'total_results': 0, 'current_count': 0} + return {"tournaments": [], "total_results": 0, "current_count": 0} except Exception as e: logger.error(f"Error parsing AJAX response: {e}") return None -def scrape_fft_all_tournaments(sorting_option=None, page=0, start_date=None, end_date=None, - city='', distance=15, categories=None, levels=None, - lat=None, lng=None, ages=None, tournament_types=None, - national_cup=False): + +def scrape_fft_all_tournaments( + sorting_option=None, + page=0, + start_date=None, + end_date=None, + city="", + distance=15, + categories=None, + levels=None, + lat=None, + lng=None, + ages=None, + tournament_types=None, + national_cup=False, +): """ Scrapes FFT tournaments using Playwright with detailed debugging Based exactly on the working scrape_fft_club_tournaments function @@ -341,16 +374,18 @@ def scrape_fft_all_tournaments(sorting_option=None, page=0, start_date=None, end browser = p.chromium.launch(headless=True) page_obj = browser.new_page() - page_obj.set_extra_http_headers({ - "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/18.5 Safari/605.1.15" - }) + page_obj.set_extra_http_headers( + { + "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/18.5 Safari/605.1.15" + } + ) # Navigate to FFT target_url = "https://tenup.fft.fr/recherche/tournois" logger.info(f"Navigating to: {target_url}") page_obj.goto(target_url) - # page_obj.wait_for_timeout(7000) + page_obj.wait_for_timeout(7000) current_url = page_obj.url logger.info(f"Current URL: {current_url}") @@ -367,7 +402,7 @@ def scrape_fft_all_tournaments(sorting_option=None, page=0, start_date=None, end browser.close() return None - form_build_id = form_input.get_attribute('value') + form_build_id = form_input.get_attribute("value") logger.info(f"Extracted form_build_id: {form_build_id}") # Build parameters - EXACT same pattern as club function @@ -486,8 +521,8 @@ def scrape_fft_all_tournaments(sorting_option=None, page=0, start_date=None, end logger.info(f"AJAX Response Status: {result.get('status')}") logger.info(f"AJAX Response Success: {result.get('success')}") - if result.get('success'): - response_text = result.get('responseText', '') + if result.get("success"): + response_text = result.get("responseText", "") logger.info(f"Raw Response Length: {len(response_text)}") try: @@ -525,13 +560,13 @@ def get_umpire_data(tournament_id): browser = p.chromium.launch( headless=True, args=[ - '--no-sandbox', - '--disable-dev-shm-usage', - '--disable-images', # Don't load images - '--disable-javascript', # Disable JS for faster loading - '--disable-plugins', - '--disable-extensions' - ] + "--no-sandbox", + "--disable-dev-shm-usage", + "--disable-images", # Don't load images + "--disable-javascript", # Disable JS for faster loading + "--disable-plugins", + "--disable-extensions", + ], ) page = browser.new_page() @@ -553,7 +588,9 @@ def get_umpire_data(tournament_id): html_content = page.content() # Extract name - name_pattern = r'tournoi-detail-page-inscription-responsable-title">\s*([^<]+)\s*<' + name_pattern = ( + r'tournoi-detail-page-inscription-responsable-title">\s*([^<]+)\s*<' + ) name_match = re.search(name_pattern, html_content) name = name_match.group(1).strip() if name_match else None @@ -569,7 +606,9 @@ def get_umpire_data(tournament_id): browser.close() - logger.info(f"Extracted umpire data: name={name}, email={email}, phone={phone}") + logger.info( + f"Extracted umpire data: name={name}, email={email}, phone={phone}" + ) return name, email, phone except Exception as page_error: @@ -581,6 +620,7 @@ def get_umpire_data(tournament_id): logger.error(f"Error in umpire data extraction: {e}") return None, None, None + def _get_umpire_data_requests_fallback(tournament_id): """ Fallback method using requests (may hit Queue-It) @@ -591,7 +631,7 @@ def _get_umpire_data_requests_fallback(tournament_id): url = f"https://tenup.fft.fr/tournoi/{tournament_id}" headers = { - 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/18.5 Safari/605.1.15' + "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/18.5 Safari/605.1.15" } response = requests.get(url, headers=headers, timeout=30) @@ -603,7 +643,9 @@ def _get_umpire_data_requests_fallback(tournament_id): html_content = response.text # Extract using regex (original method) - name_pattern = r'tournoi-detail-page-inscription-responsable-title">\s*([^<]+)\s*<' + name_pattern = ( + r'tournoi-detail-page-inscription-responsable-title">\s*([^<]+)\s*<' + ) name_match = re.search(name_pattern, html_content) name = name_match.group(1).strip() if name_match else None @@ -615,15 +657,31 @@ def _get_umpire_data_requests_fallback(tournament_id): phone_match = re.search(phone_pattern, html_content) phone = phone_match.group(1).strip() if phone_match else None - logger.info(f"Extracted umpire data (requests): name={name}, email={email}, phone={phone}") + logger.info( + f"Extracted umpire data (requests): name={name}, email={email}, phone={phone}" + ) return name, email, phone except Exception as e: logger.error(f"Error getting umpire data with requests: {e}") return None, None, None -def _scrape_single_page(sorting_option, page, start_date, end_date, city, distance, - categories, levels, lat, lng, ages, tournament_types, national_cup): + +def _scrape_single_page( + sorting_option, + page, + start_date, + end_date, + city, + distance, + categories, + levels, + lat, + lng, + ages, + tournament_types, + national_cup, +): """ Helper function to scrape a single page of tournaments """ @@ -640,14 +698,25 @@ def _scrape_single_page(sorting_option, page, start_date, end_date, city, distan lng=lng, ages=ages, tournament_types=tournament_types, - national_cup=national_cup + national_cup=national_cup, ) -def scrape_fft_all_tournaments_concurrent(sorting_option=None, start_date=None, end_date=None, - city='', distance=15, categories=None, levels=None, - lat=None, lng=None, ages=None, tournament_types=None, - national_cup=False, max_workers=5): +def scrape_fft_all_tournaments_concurrent( + sorting_option=None, + start_date=None, + end_date=None, + city="", + distance=15, + categories=None, + levels=None, + lat=None, + lng=None, + ages=None, + tournament_types=None, + national_cup=False, + max_workers=5, +): """ Scrapes all remaining pages of FFT tournaments concurrently (pages 1 to end) This assumes page 0 was already fetched by the client @@ -668,21 +737,26 @@ def scrape_fft_all_tournaments_concurrent(sorting_option=None, start_date=None, lng=lng, ages=ages, tournament_types=tournament_types, - national_cup=national_cup + national_cup=national_cup, ) if not first_page_result: logger.error("Failed to get first page results for pagination info") return None - total_results = first_page_result.get('total_results', 0) - first_page_tournaments = first_page_result.get('tournaments', []) + total_results = first_page_result.get("total_results", 0) + first_page_tournaments = first_page_result.get("tournaments", []) results_per_page = len(first_page_tournaments) logger.info(f"Total results: {total_results}, Results per page: {results_per_page}") if total_results == 0: - return {'tournaments': [], 'total_results': 0, 'current_count': 0, 'pages_scraped': 0} + return { + "tournaments": [], + "total_results": 0, + "current_count": 0, + "pages_scraped": 0, + } # Calculate number of pages needed if results_per_page > 0: @@ -694,7 +768,12 @@ def scrape_fft_all_tournaments_concurrent(sorting_option=None, start_date=None, # If only one page total, return empty since page 0 was already handled if total_pages <= 1: - return {'tournaments': [], 'total_results': total_results, 'current_count': 0, 'pages_scraped': 0} + return { + "tournaments": [], + "total_results": total_results, + "current_count": 0, + "pages_scraped": 0, + } # Scrape all remaining pages concurrently (pages 1 to total_pages-1) all_tournaments = [] @@ -705,8 +784,19 @@ def scrape_fft_all_tournaments_concurrent(sorting_option=None, start_date=None, for page in range(1, total_pages): future = executor.submit( _scrape_single_page, - sorting_option, page, start_date, end_date, city, distance, - categories, levels, lat, lng, ages, tournament_types, national_cup + sorting_option, + page, + start_date, + end_date, + city, + distance, + categories, + levels, + lat, + lng, + ages, + tournament_types, + national_cup, ) futures.append((page, future)) @@ -714,24 +804,30 @@ def scrape_fft_all_tournaments_concurrent(sorting_option=None, start_date=None, for page, future in futures: try: result = future.result(timeout=60) # 60 second timeout per page - if result and result.get('tournaments'): - tournaments = result.get('tournaments', []) + if result and result.get("tournaments"): + tournaments = result.get("tournaments", []) all_tournaments.extend(tournaments) - logger.info(f"Page {page} completed: {len(tournaments)} tournaments") + logger.info( + f"Page {page} completed: {len(tournaments)} tournaments" + ) else: logger.warning(f"Page {page} returned no results") except Exception as e: logger.error(f"Error processing page {page}: {e}") - logger.info(f"Concurrent scraping completed: {len(all_tournaments)} tournaments from {total_pages-1} remaining pages") + logger.info( + f"Concurrent scraping completed: {len(all_tournaments)} tournaments from {total_pages - 1} remaining pages" + ) return { - 'tournaments': all_tournaments, - 'total_results': total_results, - 'current_count': len(all_tournaments), - 'pages_scraped': total_pages - 1 # Excluding page 0 which was handled separately + "tournaments": all_tournaments, + "total_results": total_results, + "current_count": len(all_tournaments), + "pages_scraped": total_pages + - 1, # Excluding page 0 which was handled separately } + def _parse_clubs_ajax_response(json_data): """ Parse the clubs AJAX response to match Swift FederalClubResponse structure @@ -746,19 +842,24 @@ def _parse_clubs_ajax_response(json_data): # Try to extract clubs data from different possible response structures if isinstance(json_data, dict): # Pattern 1: Direct club_markers array - if 'club_markers' in json_data: - clubs_data = json_data['club_markers'] - total_results = json_data.get('nombreResultat', len(clubs_data)) + if "club_markers" in json_data: + clubs_data = json_data["club_markers"] + total_results = json_data.get("nombreResultat", len(clubs_data)) # Pattern 2: Results wrapper - elif 'results' in json_data: - results = json_data['results'] - clubs_data = results.get('clubs', results.get('items', results.get('club_markers', []))) - total_results = results.get('nombreResultat', results.get('total', results.get('nb_results', len(clubs_data)))) + elif "results" in json_data: + results = json_data["results"] + clubs_data = results.get( + "clubs", results.get("items", results.get("club_markers", [])) + ) + total_results = results.get( + "nombreResultat", + results.get("total", results.get("nb_results", len(clubs_data))), + ) # Pattern 3: Direct array in response - elif 'data' in json_data: - clubs_data = json_data['data'] + elif "data" in json_data: + clubs_data = json_data["data"] total_results = len(clubs_data) # Pattern 4: Response is the clubs array directly @@ -780,49 +881,55 @@ def _parse_clubs_ajax_response(json_data): if isinstance(item, dict): # Extract pratiques array pratiques = [] - if 'pratiques' in item: - pratiques = item['pratiques'] - elif 'practices' in item: - pratiques = item['practices'] + if "pratiques" in item: + pratiques = item["pratiques"] + elif "practices" in item: + pratiques = item["practices"] else: # Default to PADEL if not specified pratiques = ["PADEL"] # Ensure pratiques are uppercase strings - pratiques = [p.upper() if isinstance(p, str) else str(p).upper() for p in pratiques] + pratiques = [ + p.upper() if isinstance(p, str) else str(p).upper() + for p in pratiques + ] club_marker = { - "nom": item.get('nom', item.get('name', '')), - "clubId": str(item.get('clubId', item.get('id', item.get('code', '')))), - "ville": item.get('ville', item.get('city', '')), - "distance": str(item.get('distance', '0')), - "terrainPratiqueLibelle": item.get('terrainPratiqueLibelle', item.get('courtsInfo', '')), + "nom": item.get("nom", item.get("name", "")), + "clubId": str( + item.get("clubId", item.get("id", item.get("code", ""))) + ), + "ville": item.get("ville", item.get("city", "")), + "distance": str(item.get("distance", "0")), + "terrainPratiqueLibelle": item.get( + "terrainPratiqueLibelle", item.get("courtsInfo", "") + ), "pratiques": pratiques, - "lat": float(item.get('lat', item.get('latitude', 0.0))), - "lng": float(item.get('lng', item.get('longitude', 0.0))) + "lat": float(item.get("lat", item.get("latitude", 0.0))), + "lng": float(item.get("lng", item.get("longitude", 0.0))), } club_markers.append(club_marker) - logger.info(f"Successfully parsed {len(club_markers)} club markers from response") + logger.info( + f"Successfully parsed {len(club_markers)} club markers from response" + ) # Return the response in the format expected by Swift FederalClubResponse return { "typeRecherche": "clubs", "nombreResultat": total_results, - "club_markers": club_markers + "club_markers": club_markers, } except Exception as e: logger.error(f"Error parsing clubs AJAX response: {e}") - return { - "typeRecherche": "clubs", - "nombreResultat": 0, - "club_markers": [] - } + return {"typeRecherche": "clubs", "nombreResultat": 0, "club_markers": []} -def scrape_federal_clubs(country=None, city='', latitude=None, longitude=None, - radius=15, max_workers=5): +def scrape_federal_clubs( + country=None, city="", latitude=None, longitude=None, radius=15, max_workers=5 +): """ Scrapes FFT federal clubs by extracting data from the HTML response """ @@ -833,16 +940,19 @@ def scrape_federal_clubs(country=None, city='', latitude=None, longitude=None, browser = p.chromium.launch(headless=True) page_obj = browser.new_page() - page_obj.set_extra_http_headers({ - "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/18.5 Safari/605.1.15" - }) + page_obj.set_extra_http_headers( + { + "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/18.5 Safari/605.1.15" + } + ) # Clean up city name - remove zip code and extra info clean_city = city if city: import re - clean_city = re.sub(r'[,\s]*\d{5}.*$', '', city).strip() - clean_city = clean_city.rstrip(',').strip() + + clean_city = re.sub(r"[,\s]*\d{5}.*$", "", city).strip() + clean_city = clean_city.rstrip(",").strip() logger.info(f"Cleaned city name: '{city}' -> '{clean_city}'") @@ -856,7 +966,7 @@ def scrape_federal_clubs(country=None, city='', latitude=None, longitude=None, page_obj.goto(results_url) # Wait for the page to load - page_obj.wait_for_timeout(3000) + page_obj.wait_for_timeout(7000) # Check if we're in queue if "queue-it.net" in page_obj.url.lower(): @@ -865,7 +975,7 @@ def scrape_federal_clubs(country=None, city='', latitude=None, longitude=None, return { "typeRecherche": "clubs", "nombreResultat": 0, - "club_markers": [] + "club_markers": [], } # Use JavaScript to extract the data directly from the page @@ -904,45 +1014,45 @@ def scrape_federal_clubs(country=None, city='', latitude=None, longitude=None, browser.close() - if result.get('success'): - type_recherche = result.get('typeRecherche', 'club') - total = result.get('total', 0) - resultat = result.get('resultat', []) + if result.get("success"): + type_recherche = result.get("typeRecherche", "club") + total = result.get("total", 0) + resultat = result.get("resultat", []) logger.info(f"Successfully extracted {total} clubs") # Convert resultat to club_markers format club_markers = [] for club in resultat: - club_markers.append({ - "nom": club.get('nom', ''), - "clubId": club.get('clubId', ''), - "ville": club.get('ville', ''), - "distance": club.get('distance', ''), - "terrainPratiqueLibelle": club.get('terrainPratiqueLibelle', ''), - "pratiques": club.get('pratiques', []), - "lat": club.get('lat', 0.0), - "lng": club.get('lng', 0.0) - }) + club_markers.append( + { + "nom": club.get("nom", ""), + "clubId": club.get("clubId", ""), + "ville": club.get("ville", ""), + "distance": club.get("distance", ""), + "terrainPratiqueLibelle": club.get( + "terrainPratiqueLibelle", "" + ), + "pratiques": club.get("pratiques", []), + "lat": club.get("lat", 0.0), + "lng": club.get("lng", 0.0), + } + ) return { "typeRecherche": type_recherche, "nombreResultat": total, - "club_markers": club_markers + "club_markers": club_markers, } else: logger.error(f"Failed to extract data: {result.get('error')}") return { "typeRecherche": "clubs", "nombreResultat": 0, - "club_markers": [] + "club_markers": [], } except Exception as e: logger.error(f"Error in federal clubs scraping: {e}") logger.error(f"Traceback: {traceback.format_exc()}") - return { - "typeRecherche": "clubs", - "nombreResultat": 0, - "club_markers": [] - } + return {"typeRecherche": "clubs", "nombreResultat": 0, "club_markers": []}