add waiting fft scraping

mailing
Razmig Sarkissian 2 months ago
parent 33f170dd3e
commit 1269b97765
  1. 484
      api/utils.py

@ -9,19 +9,23 @@ from concurrent.futures import ThreadPoolExecutor, as_completed
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
def check_version_smaller_than_1_1_12(version_str):
    """Return True when ``version_str`` denotes a version older than 1.1.12.

    The version may carry a trailing build marker separated by whitespace,
    e.g. ``"1.1.12 (2)"``; only the first whitespace-delimited token is
    compared.

    Args:
        version_str: Dotted numeric version such as ``"1.1.11"`` or
            ``"1.1.12 (2)"``.

    Returns:
        True if the dotted components compare lower than ``[1, 1, 12]``.
        False otherwise, including when ``version_str`` is empty, None or
        contains only whitespace.

    Raises:
        ValueError: If a dotted component is not an integer (e.g. ``"1.x"``).
    """
    # Guard before indexing: "".split() is an empty list, so the previous
    # unconditional ``split()[0]`` raised IndexError and never reached the
    # ``return False`` fallback intended for missing versions.
    tokens = version_str.split() if version_str else []
    if not tokens:
        return False

    # Drop the parenthesised build part by keeping only the first token,
    # then split the version into integer components.
    version_parts = [int(part) for part in tokens[0].split(".")]

    # Lists compare lexicographically, so a shorter prefix such as [1, 1]
    # sorts before [1, 1, 12].
    return version_parts < [1, 1, 12]
def scrape_fft_club_tournaments(club_code, club_name, start_date=None, end_date=None, page=0):
def scrape_fft_club_tournaments(
club_code, club_name, start_date=None, end_date=None, page=0
):
""" """
Scrapes FFT tournaments using Playwright with detailed debugging Scrapes FFT tournaments using Playwright with detailed debugging
""" """
@ -31,16 +35,18 @@ def scrape_fft_club_tournaments(club_code, club_name, start_date=None, end_date=
browser = p.chromium.launch(headless=True) browser = p.chromium.launch(headless=True)
page_obj = browser.new_page() page_obj = browser.new_page()
page_obj.set_extra_http_headers({ page_obj.set_extra_http_headers(
{
"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/18.5 Safari/605.1.15" "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/18.5 Safari/605.1.15"
}) }
)
# Navigate to FFT # Navigate to FFT
target_url = "https://tenup.fft.fr/recherche/tournois" target_url = "https://tenup.fft.fr/recherche/tournois"
logger.info(f"Navigating to: {target_url}") logger.info(f"Navigating to: {target_url}")
page_obj.goto(target_url) page_obj.goto(target_url)
# page_obj.wait_for_timeout(7000) page_obj.wait_for_timeout(7000)
current_url = page_obj.url current_url = page_obj.url
logger.info(f"Current URL: {current_url}") logger.info(f"Current URL: {current_url}")
@ -57,7 +63,7 @@ def scrape_fft_club_tournaments(club_code, club_name, start_date=None, end_date=
browser.close() browser.close()
return None return None
form_build_id = form_input.get_attribute('value') form_build_id = form_input.get_attribute("value")
logger.info(f"Extracted form_build_id: {form_build_id}") logger.info(f"Extracted form_build_id: {form_build_id}")
# Build parameters # Build parameters
@ -121,8 +127,8 @@ def scrape_fft_club_tournaments(club_code, club_name, start_date=None, end_date=
logger.info(f"AJAX Response Status: {result.get('status')}") logger.info(f"AJAX Response Status: {result.get('status')}")
logger.info(f"AJAX Response Success: {result.get('success')}") logger.info(f"AJAX Response Success: {result.get('success')}")
if result.get('success'): if result.get("success"):
response_text = result.get('responseText', '') response_text = result.get("responseText", "")
logger.info(f"Raw Response Length: {len(response_text)}") logger.info(f"Raw Response Length: {len(response_text)}")
# logger.info(f"Raw Response (first 500 chars): {response_text[:500]}") # logger.info(f"Raw Response (first 500 chars): {response_text[:500]}")
@ -151,7 +157,10 @@ def scrape_fft_club_tournaments(club_code, club_name, start_date=None, end_date=
logger.error(f"Traceback: {traceback.format_exc()}") logger.error(f"Traceback: {traceback.format_exc()}")
return None return None
def scrape_fft_club_tournaments_all_pages(club_code, club_name, start_date=None, end_date=None):
def scrape_fft_club_tournaments_all_pages(
club_code, club_name, start_date=None, end_date=None
):
""" """
Scrapes all pages of FFT tournaments for a specific club Scrapes all pages of FFT tournaments for a specific club
""" """
@ -168,7 +177,7 @@ def scrape_fft_club_tournaments_all_pages(club_code, club_name, start_date=None,
club_name=club_name, club_name=club_name,
start_date=start_date, start_date=start_date,
end_date=end_date, end_date=end_date,
page=page page=page,
) )
# Debug: Log what we got # Debug: Log what we got
@ -178,7 +187,7 @@ def scrape_fft_club_tournaments_all_pages(club_code, club_name, start_date=None,
logger.warning(f"No result for page {page}") logger.warning(f"No result for page {page}")
break break
tournaments = result.get('tournaments', []) tournaments = result.get("tournaments", [])
logger.info(f"Page {page} returned {len(tournaments)} tournaments") logger.info(f"Page {page} returned {len(tournaments)} tournaments")
if not tournaments: if not tournaments:
@ -188,8 +197,10 @@ def scrape_fft_club_tournaments_all_pages(club_code, club_name, start_date=None,
all_tournaments.extend(tournaments) all_tournaments.extend(tournaments)
# Check if we have all results # Check if we have all results
total_results = result.get('total_results', 0) total_results = result.get("total_results", 0)
logger.info(f"Total so far: {len(all_tournaments)}, Target: {total_results}") logger.info(
f"Total so far: {len(all_tournaments)}, Target: {total_results}"
)
if len(all_tournaments) >= total_results: if len(all_tournaments) >= total_results:
logger.info("Got all tournaments, stopping") logger.info("Got all tournaments, stopping")
@ -203,15 +214,18 @@ def scrape_fft_club_tournaments_all_pages(club_code, club_name, start_date=None,
logger.error(f"Error on page {page}: {e}") logger.error(f"Error on page {page}: {e}")
break break
logger.info(f"Completed scraping: {len(all_tournaments)} tournaments across {page + 1} pages") logger.info(
f"Completed scraping: {len(all_tournaments)} tournaments across {page + 1} pages"
)
return { return {
'tournaments': all_tournaments, "tournaments": all_tournaments,
'total_results': len(all_tournaments), "total_results": len(all_tournaments),
'current_count': len(all_tournaments), "current_count": len(all_tournaments),
'pages_scraped': page + 1 "pages_scraped": page + 1,
} }
def _parse_ajax_response(commands): def _parse_ajax_response(commands):
""" """
Parse the AJAX response commands to extract tournament data Parse the AJAX response commands to extract tournament data
@ -222,114 +236,133 @@ def _parse_ajax_response(commands):
try: try:
# Check for alert commands (maintenance mode) # Check for alert commands (maintenance mode)
for command in commands: for command in commands:
if command.get('command') == 'alert': if command.get("command") == "alert":
logger.warning("Maintenance mode detected") logger.warning("Maintenance mode detected")
return None return None
# Find the command with results # Find the command with results
result_command = None result_command = None
for command in commands: for command in commands:
if command.get('command') == 'recherche_tournois_update': if command.get("command") == "recherche_tournois_update":
result_command = command result_command = command
logger.info("Found recherche_tournois_update command!") logger.info("Found recherche_tournois_update command!")
break break
if result_command and result_command.get('results'): if result_command and result_command.get("results"):
results = result_command['results'] results = result_command["results"]
items = results.get('items', []) items = results.get("items", [])
total_results = results.get('nb_results', 0) total_results = results.get("nb_results", 0)
logger.info(f"Processing {len(items)} tournaments from results") logger.info(f"Processing {len(items)} tournaments from results")
for item in items: for item in items:
# Parse dates - they're already in the correct format # Parse dates - they're already in the correct format
date_debut = item.get('dateDebut') date_debut = item.get("dateDebut")
date_fin = item.get('dateFin') date_fin = item.get("dateFin")
date_validation = item.get('dateValidation') date_validation = item.get("dateValidation")
# Build the tournament object to match Swift FederalTournament structure # Build the tournament object to match Swift FederalTournament structure
tournament = { tournament = {
"id": str(item.get('id', '')), "id": str(item.get("id", "")),
"millesime": item.get('millesime'), "millesime": item.get("millesime"),
"libelle": item.get('libelle'), "libelle": item.get("libelle"),
"tmc": item.get('tmc'), "tmc": item.get("tmc"),
"tarifAdulteChampionnat": item.get('tarifAdulteChampionnat'), "tarifAdulteChampionnat": item.get("tarifAdulteChampionnat"),
"type": item.get('type'), "type": item.get("type"),
"ageReel": item.get('ageReel'), "ageReel": item.get("ageReel"),
"naturesTerrains": item.get('naturesTerrains', []), "naturesTerrains": item.get("naturesTerrains", []),
"idsArbitres": item.get('idsArbitres', []), "idsArbitres": item.get("idsArbitres", []),
"tarifJeuneChampionnat": item.get('tarifJeuneChampionnat'), "tarifJeuneChampionnat": item.get("tarifJeuneChampionnat"),
"international": item.get('international'), "international": item.get("international"),
"inscriptionEnLigne": item.get('inscriptionEnLigne'), "inscriptionEnLigne": item.get("inscriptionEnLigne"),
"categorieTournoi": item.get('categorieTournoi'), "categorieTournoi": item.get("categorieTournoi"),
"prixLot": item.get('prixLot'), "prixLot": item.get("prixLot"),
"paiementEnLigne": item.get('paiementEnLigne'), "paiementEnLigne": item.get("paiementEnLigne"),
"reductionAdherentJeune": item.get('reductionAdherentJeune'), "reductionAdherentJeune": item.get("reductionAdherentJeune"),
"reductionAdherentAdulte": item.get('reductionAdherentAdulte'), "reductionAdherentAdulte": item.get("reductionAdherentAdulte"),
"paiementEnLigneObligatoire": item.get('paiementEnLigneObligatoire'), "paiementEnLigneObligatoire": item.get(
"villeEngagement": item.get('villeEngagement'), "paiementEnLigneObligatoire"
"senior": item.get('senior'), ),
"veteran": item.get('veteran'), "villeEngagement": item.get("villeEngagement"),
"inscriptionEnLigneEnCours": item.get('inscriptionEnLigneEnCours'), "senior": item.get("senior"),
"avecResultatPublie": item.get('avecResultatPublie'), "veteran": item.get("veteran"),
"code": item.get('code'), "inscriptionEnLigneEnCours": item.get("inscriptionEnLigneEnCours"),
"categorieAge": item.get('categorieAge'), "avecResultatPublie": item.get("avecResultatPublie"),
"codeComite": item.get('codeComite'), "code": item.get("code"),
"installations": item.get('installations', []), "categorieAge": item.get("categorieAge"),
"reductionEpreuveSupplementaireJeune": item.get('reductionEpreuveSupplementaireJeune'), "codeComite": item.get("codeComite"),
"reductionEpreuveSupplementaireAdulte": item.get('reductionEpreuveSupplementaireAdulte'), "installations": item.get("installations", []),
"nomComite": item.get('nomComite'), "reductionEpreuveSupplementaireJeune": item.get(
"naturesEpreuves": item.get('naturesEpreuves'), "reductionEpreuveSupplementaireJeune"
"jeune": item.get('jeune'), ),
"courrielEngagement": item.get('courrielEngagement'), "reductionEpreuveSupplementaireAdulte": item.get(
"nomClub": item.get('nomClub'), "reductionEpreuveSupplementaireAdulte"
"installation": item.get('installation'), ),
"categorieAgeMax": item.get('categorieAgeMax'), "nomComite": item.get("nomComite"),
"tournoiInterne": item.get('tournoiInterne'), "naturesEpreuves": item.get("naturesEpreuves"),
"nomLigue": item.get('nomLigue'), "jeune": item.get("jeune"),
"nomEngagement": item.get('nomEngagement'), "courrielEngagement": item.get("courrielEngagement"),
"codeLigue": item.get('codeLigue'), "nomClub": item.get("nomClub"),
"modeleDeBalle": item.get('modeleDeBalle'), "installation": item.get("installation"),
"jugeArbitre": item.get('jugeArbitre'), "categorieAgeMax": item.get("categorieAgeMax"),
"adresse2Engagement": item.get('adresse2Engagement'), "tournoiInterne": item.get("tournoiInterne"),
"epreuves": item.get('epreuves'), "nomLigue": item.get("nomLigue"),
"nomEngagement": item.get("nomEngagement"),
"codeLigue": item.get("codeLigue"),
"modeleDeBalle": item.get("modeleDeBalle"),
"jugeArbitre": item.get("jugeArbitre"),
"adresse2Engagement": item.get("adresse2Engagement"),
"epreuves": item.get("epreuves"),
"dateDebut": date_debut, "dateDebut": date_debut,
"serie": item.get('serie'), "serie": item.get("serie"),
"dateFin": date_fin, "dateFin": date_fin,
"dateValidation": date_validation, "dateValidation": date_validation,
"codePostalEngagement": item.get('codePostalEngagement'), "codePostalEngagement": item.get("codePostalEngagement"),
"codeClub": item.get('codeClub'), "codeClub": item.get("codeClub"),
"prixEspece": item.get('prixEspece'), "prixEspece": item.get("prixEspece"),
"japPhoneNumber": None, # Will be populated by separate umpire call "japPhoneNumber": None, # Will be populated by separate umpire call
# Additional fields from the response # Additional fields from the response
"adresse1Engagement": item.get('adresse1Engagement'), "adresse1Engagement": item.get("adresse1Engagement"),
"originalId": item.get('originalId'), "originalId": item.get("originalId"),
"familleTournoi": item.get('familleTournoi', []), "familleTournoi": item.get("familleTournoi", []),
"isTournoi": item.get('isTournoi'), "isTournoi": item.get("isTournoi"),
"natureWithCatAge": item.get('natureWithCatAge') "natureWithCatAge": item.get("natureWithCatAge"),
} }
tournaments.append(tournament) tournaments.append(tournament)
logger.info(f"Successfully parsed {len(tournaments)} tournaments from response") logger.info(
f"Successfully parsed {len(tournaments)} tournaments from response"
)
return { return {
'tournaments': tournaments, "tournaments": tournaments,
'total_results': total_results, "total_results": total_results,
'current_count': len(tournaments) "current_count": len(tournaments),
} }
else: else:
logger.error("No recherche_tournois_update command found in AJAX response") logger.error("No recherche_tournois_update command found in AJAX response")
return {'tournaments': [], 'total_results': 0, 'current_count': 0} return {"tournaments": [], "total_results": 0, "current_count": 0}
except Exception as e: except Exception as e:
logger.error(f"Error parsing AJAX response: {e}") logger.error(f"Error parsing AJAX response: {e}")
return None return None
def scrape_fft_all_tournaments(sorting_option=None, page=0, start_date=None, end_date=None,
city='', distance=15, categories=None, levels=None, def scrape_fft_all_tournaments(
lat=None, lng=None, ages=None, tournament_types=None, sorting_option=None,
national_cup=False): page=0,
start_date=None,
end_date=None,
city="",
distance=15,
categories=None,
levels=None,
lat=None,
lng=None,
ages=None,
tournament_types=None,
national_cup=False,
):
""" """
Scrapes FFT tournaments using Playwright with detailed debugging Scrapes FFT tournaments using Playwright with detailed debugging
Based exactly on the working scrape_fft_club_tournaments function Based exactly on the working scrape_fft_club_tournaments function
@ -341,16 +374,18 @@ def scrape_fft_all_tournaments(sorting_option=None, page=0, start_date=None, end
browser = p.chromium.launch(headless=True) browser = p.chromium.launch(headless=True)
page_obj = browser.new_page() page_obj = browser.new_page()
page_obj.set_extra_http_headers({ page_obj.set_extra_http_headers(
{
"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/18.5 Safari/605.1.15" "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/18.5 Safari/605.1.15"
}) }
)
# Navigate to FFT # Navigate to FFT
target_url = "https://tenup.fft.fr/recherche/tournois" target_url = "https://tenup.fft.fr/recherche/tournois"
logger.info(f"Navigating to: {target_url}") logger.info(f"Navigating to: {target_url}")
page_obj.goto(target_url) page_obj.goto(target_url)
# page_obj.wait_for_timeout(7000) page_obj.wait_for_timeout(7000)
current_url = page_obj.url current_url = page_obj.url
logger.info(f"Current URL: {current_url}") logger.info(f"Current URL: {current_url}")
@ -367,7 +402,7 @@ def scrape_fft_all_tournaments(sorting_option=None, page=0, start_date=None, end
browser.close() browser.close()
return None return None
form_build_id = form_input.get_attribute('value') form_build_id = form_input.get_attribute("value")
logger.info(f"Extracted form_build_id: {form_build_id}") logger.info(f"Extracted form_build_id: {form_build_id}")
# Build parameters - EXACT same pattern as club function # Build parameters - EXACT same pattern as club function
@ -486,8 +521,8 @@ def scrape_fft_all_tournaments(sorting_option=None, page=0, start_date=None, end
logger.info(f"AJAX Response Status: {result.get('status')}") logger.info(f"AJAX Response Status: {result.get('status')}")
logger.info(f"AJAX Response Success: {result.get('success')}") logger.info(f"AJAX Response Success: {result.get('success')}")
if result.get('success'): if result.get("success"):
response_text = result.get('responseText', '') response_text = result.get("responseText", "")
logger.info(f"Raw Response Length: {len(response_text)}") logger.info(f"Raw Response Length: {len(response_text)}")
try: try:
@ -525,13 +560,13 @@ def get_umpire_data(tournament_id):
browser = p.chromium.launch( browser = p.chromium.launch(
headless=True, headless=True,
args=[ args=[
'--no-sandbox', "--no-sandbox",
'--disable-dev-shm-usage', "--disable-dev-shm-usage",
'--disable-images', # Don't load images "--disable-images", # Don't load images
'--disable-javascript', # Disable JS for faster loading "--disable-javascript", # Disable JS for faster loading
'--disable-plugins', "--disable-plugins",
'--disable-extensions' "--disable-extensions",
] ],
) )
page = browser.new_page() page = browser.new_page()
@ -553,7 +588,9 @@ def get_umpire_data(tournament_id):
html_content = page.content() html_content = page.content()
# Extract name # Extract name
name_pattern = r'tournoi-detail-page-inscription-responsable-title">\s*([^<]+)\s*<' name_pattern = (
r'tournoi-detail-page-inscription-responsable-title">\s*([^<]+)\s*<'
)
name_match = re.search(name_pattern, html_content) name_match = re.search(name_pattern, html_content)
name = name_match.group(1).strip() if name_match else None name = name_match.group(1).strip() if name_match else None
@ -569,7 +606,9 @@ def get_umpire_data(tournament_id):
browser.close() browser.close()
logger.info(f"Extracted umpire data: name={name}, email={email}, phone={phone}") logger.info(
f"Extracted umpire data: name={name}, email={email}, phone={phone}"
)
return name, email, phone return name, email, phone
except Exception as page_error: except Exception as page_error:
@ -581,6 +620,7 @@ def get_umpire_data(tournament_id):
logger.error(f"Error in umpire data extraction: {e}") logger.error(f"Error in umpire data extraction: {e}")
return None, None, None return None, None, None
def _get_umpire_data_requests_fallback(tournament_id): def _get_umpire_data_requests_fallback(tournament_id):
""" """
Fallback method using requests (may hit Queue-It) Fallback method using requests (may hit Queue-It)
@ -591,7 +631,7 @@ def _get_umpire_data_requests_fallback(tournament_id):
url = f"https://tenup.fft.fr/tournoi/{tournament_id}" url = f"https://tenup.fft.fr/tournoi/{tournament_id}"
headers = { headers = {
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/18.5 Safari/605.1.15' "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/18.5 Safari/605.1.15"
} }
response = requests.get(url, headers=headers, timeout=30) response = requests.get(url, headers=headers, timeout=30)
@ -603,7 +643,9 @@ def _get_umpire_data_requests_fallback(tournament_id):
html_content = response.text html_content = response.text
# Extract using regex (original method) # Extract using regex (original method)
name_pattern = r'tournoi-detail-page-inscription-responsable-title">\s*([^<]+)\s*<' name_pattern = (
r'tournoi-detail-page-inscription-responsable-title">\s*([^<]+)\s*<'
)
name_match = re.search(name_pattern, html_content) name_match = re.search(name_pattern, html_content)
name = name_match.group(1).strip() if name_match else None name = name_match.group(1).strip() if name_match else None
@ -615,15 +657,31 @@ def _get_umpire_data_requests_fallback(tournament_id):
phone_match = re.search(phone_pattern, html_content) phone_match = re.search(phone_pattern, html_content)
phone = phone_match.group(1).strip() if phone_match else None phone = phone_match.group(1).strip() if phone_match else None
logger.info(f"Extracted umpire data (requests): name={name}, email={email}, phone={phone}") logger.info(
f"Extracted umpire data (requests): name={name}, email={email}, phone={phone}"
)
return name, email, phone return name, email, phone
except Exception as e: except Exception as e:
logger.error(f"Error getting umpire data with requests: {e}") logger.error(f"Error getting umpire data with requests: {e}")
return None, None, None return None, None, None
def _scrape_single_page(sorting_option, page, start_date, end_date, city, distance,
categories, levels, lat, lng, ages, tournament_types, national_cup): def _scrape_single_page(
sorting_option,
page,
start_date,
end_date,
city,
distance,
categories,
levels,
lat,
lng,
ages,
tournament_types,
national_cup,
):
""" """
Helper function to scrape a single page of tournaments Helper function to scrape a single page of tournaments
""" """
@ -640,14 +698,25 @@ def _scrape_single_page(sorting_option, page, start_date, end_date, city, distan
lng=lng, lng=lng,
ages=ages, ages=ages,
tournament_types=tournament_types, tournament_types=tournament_types,
national_cup=national_cup national_cup=national_cup,
) )
def scrape_fft_all_tournaments_concurrent(sorting_option=None, start_date=None, end_date=None, def scrape_fft_all_tournaments_concurrent(
city='', distance=15, categories=None, levels=None, sorting_option=None,
lat=None, lng=None, ages=None, tournament_types=None, start_date=None,
national_cup=False, max_workers=5): end_date=None,
city="",
distance=15,
categories=None,
levels=None,
lat=None,
lng=None,
ages=None,
tournament_types=None,
national_cup=False,
max_workers=5,
):
""" """
Scrapes all remaining pages of FFT tournaments concurrently (pages 1 to end) Scrapes all remaining pages of FFT tournaments concurrently (pages 1 to end)
This assumes page 0 was already fetched by the client This assumes page 0 was already fetched by the client
@ -668,21 +737,26 @@ def scrape_fft_all_tournaments_concurrent(sorting_option=None, start_date=None,
lng=lng, lng=lng,
ages=ages, ages=ages,
tournament_types=tournament_types, tournament_types=tournament_types,
national_cup=national_cup national_cup=national_cup,
) )
if not first_page_result: if not first_page_result:
logger.error("Failed to get first page results for pagination info") logger.error("Failed to get first page results for pagination info")
return None return None
total_results = first_page_result.get('total_results', 0) total_results = first_page_result.get("total_results", 0)
first_page_tournaments = first_page_result.get('tournaments', []) first_page_tournaments = first_page_result.get("tournaments", [])
results_per_page = len(first_page_tournaments) results_per_page = len(first_page_tournaments)
logger.info(f"Total results: {total_results}, Results per page: {results_per_page}") logger.info(f"Total results: {total_results}, Results per page: {results_per_page}")
if total_results == 0: if total_results == 0:
return {'tournaments': [], 'total_results': 0, 'current_count': 0, 'pages_scraped': 0} return {
"tournaments": [],
"total_results": 0,
"current_count": 0,
"pages_scraped": 0,
}
# Calculate number of pages needed # Calculate number of pages needed
if results_per_page > 0: if results_per_page > 0:
@ -694,7 +768,12 @@ def scrape_fft_all_tournaments_concurrent(sorting_option=None, start_date=None,
# If only one page total, return empty since page 0 was already handled # If only one page total, return empty since page 0 was already handled
if total_pages <= 1: if total_pages <= 1:
return {'tournaments': [], 'total_results': total_results, 'current_count': 0, 'pages_scraped': 0} return {
"tournaments": [],
"total_results": total_results,
"current_count": 0,
"pages_scraped": 0,
}
# Scrape all remaining pages concurrently (pages 1 to total_pages-1) # Scrape all remaining pages concurrently (pages 1 to total_pages-1)
all_tournaments = [] all_tournaments = []
@ -705,8 +784,19 @@ def scrape_fft_all_tournaments_concurrent(sorting_option=None, start_date=None,
for page in range(1, total_pages): for page in range(1, total_pages):
future = executor.submit( future = executor.submit(
_scrape_single_page, _scrape_single_page,
sorting_option, page, start_date, end_date, city, distance, sorting_option,
categories, levels, lat, lng, ages, tournament_types, national_cup page,
start_date,
end_date,
city,
distance,
categories,
levels,
lat,
lng,
ages,
tournament_types,
national_cup,
) )
futures.append((page, future)) futures.append((page, future))
@ -714,24 +804,30 @@ def scrape_fft_all_tournaments_concurrent(sorting_option=None, start_date=None,
for page, future in futures: for page, future in futures:
try: try:
result = future.result(timeout=60) # 60 second timeout per page result = future.result(timeout=60) # 60 second timeout per page
if result and result.get('tournaments'): if result and result.get("tournaments"):
tournaments = result.get('tournaments', []) tournaments = result.get("tournaments", [])
all_tournaments.extend(tournaments) all_tournaments.extend(tournaments)
logger.info(f"Page {page} completed: {len(tournaments)} tournaments") logger.info(
f"Page {page} completed: {len(tournaments)} tournaments"
)
else: else:
logger.warning(f"Page {page} returned no results") logger.warning(f"Page {page} returned no results")
except Exception as e: except Exception as e:
logger.error(f"Error processing page {page}: {e}") logger.error(f"Error processing page {page}: {e}")
logger.info(f"Concurrent scraping completed: {len(all_tournaments)} tournaments from {total_pages-1} remaining pages") logger.info(
f"Concurrent scraping completed: {len(all_tournaments)} tournaments from {total_pages - 1} remaining pages"
)
return { return {
'tournaments': all_tournaments, "tournaments": all_tournaments,
'total_results': total_results, "total_results": total_results,
'current_count': len(all_tournaments), "current_count": len(all_tournaments),
'pages_scraped': total_pages - 1 # Excluding page 0 which was handled separately "pages_scraped": total_pages
- 1, # Excluding page 0 which was handled separately
} }
def _parse_clubs_ajax_response(json_data): def _parse_clubs_ajax_response(json_data):
""" """
Parse the clubs AJAX response to match Swift FederalClubResponse structure Parse the clubs AJAX response to match Swift FederalClubResponse structure
@ -746,19 +842,24 @@ def _parse_clubs_ajax_response(json_data):
# Try to extract clubs data from different possible response structures # Try to extract clubs data from different possible response structures
if isinstance(json_data, dict): if isinstance(json_data, dict):
# Pattern 1: Direct club_markers array # Pattern 1: Direct club_markers array
if 'club_markers' in json_data: if "club_markers" in json_data:
clubs_data = json_data['club_markers'] clubs_data = json_data["club_markers"]
total_results = json_data.get('nombreResultat', len(clubs_data)) total_results = json_data.get("nombreResultat", len(clubs_data))
# Pattern 2: Results wrapper # Pattern 2: Results wrapper
elif 'results' in json_data: elif "results" in json_data:
results = json_data['results'] results = json_data["results"]
clubs_data = results.get('clubs', results.get('items', results.get('club_markers', []))) clubs_data = results.get(
total_results = results.get('nombreResultat', results.get('total', results.get('nb_results', len(clubs_data)))) "clubs", results.get("items", results.get("club_markers", []))
)
total_results = results.get(
"nombreResultat",
results.get("total", results.get("nb_results", len(clubs_data))),
)
# Pattern 3: Direct array in response # Pattern 3: Direct array in response
elif 'data' in json_data: elif "data" in json_data:
clubs_data = json_data['data'] clubs_data = json_data["data"]
total_results = len(clubs_data) total_results = len(clubs_data)
# Pattern 4: Response is the clubs array directly # Pattern 4: Response is the clubs array directly
@ -780,49 +881,55 @@ def _parse_clubs_ajax_response(json_data):
if isinstance(item, dict): if isinstance(item, dict):
# Extract pratiques array # Extract pratiques array
pratiques = [] pratiques = []
if 'pratiques' in item: if "pratiques" in item:
pratiques = item['pratiques'] pratiques = item["pratiques"]
elif 'practices' in item: elif "practices" in item:
pratiques = item['practices'] pratiques = item["practices"]
else: else:
# Default to PADEL if not specified # Default to PADEL if not specified
pratiques = ["PADEL"] pratiques = ["PADEL"]
# Ensure pratiques are uppercase strings # Ensure pratiques are uppercase strings
pratiques = [p.upper() if isinstance(p, str) else str(p).upper() for p in pratiques] pratiques = [
p.upper() if isinstance(p, str) else str(p).upper()
for p in pratiques
]
club_marker = { club_marker = {
"nom": item.get('nom', item.get('name', '')), "nom": item.get("nom", item.get("name", "")),
"clubId": str(item.get('clubId', item.get('id', item.get('code', '')))), "clubId": str(
"ville": item.get('ville', item.get('city', '')), item.get("clubId", item.get("id", item.get("code", "")))
"distance": str(item.get('distance', '0')), ),
"terrainPratiqueLibelle": item.get('terrainPratiqueLibelle', item.get('courtsInfo', '')), "ville": item.get("ville", item.get("city", "")),
"distance": str(item.get("distance", "0")),
"terrainPratiqueLibelle": item.get(
"terrainPratiqueLibelle", item.get("courtsInfo", "")
),
"pratiques": pratiques, "pratiques": pratiques,
"lat": float(item.get('lat', item.get('latitude', 0.0))), "lat": float(item.get("lat", item.get("latitude", 0.0))),
"lng": float(item.get('lng', item.get('longitude', 0.0))) "lng": float(item.get("lng", item.get("longitude", 0.0))),
} }
club_markers.append(club_marker) club_markers.append(club_marker)
logger.info(f"Successfully parsed {len(club_markers)} club markers from response") logger.info(
f"Successfully parsed {len(club_markers)} club markers from response"
)
# Return the response in the format expected by Swift FederalClubResponse # Return the response in the format expected by Swift FederalClubResponse
return { return {
"typeRecherche": "clubs", "typeRecherche": "clubs",
"nombreResultat": total_results, "nombreResultat": total_results,
"club_markers": club_markers "club_markers": club_markers,
} }
except Exception as e: except Exception as e:
logger.error(f"Error parsing clubs AJAX response: {e}") logger.error(f"Error parsing clubs AJAX response: {e}")
return { return {"typeRecherche": "clubs", "nombreResultat": 0, "club_markers": []}
"typeRecherche": "clubs",
"nombreResultat": 0,
"club_markers": []
}
def scrape_federal_clubs(country=None, city='', latitude=None, longitude=None, def scrape_federal_clubs(
radius=15, max_workers=5): country=None, city="", latitude=None, longitude=None, radius=15, max_workers=5
):
""" """
Scrapes FFT federal clubs by extracting data from the HTML response Scrapes FFT federal clubs by extracting data from the HTML response
""" """
@ -833,16 +940,19 @@ def scrape_federal_clubs(country=None, city='', latitude=None, longitude=None,
browser = p.chromium.launch(headless=True) browser = p.chromium.launch(headless=True)
page_obj = browser.new_page() page_obj = browser.new_page()
page_obj.set_extra_http_headers({ page_obj.set_extra_http_headers(
{
"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/18.5 Safari/605.1.15" "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/18.5 Safari/605.1.15"
}) }
)
# Clean up city name - remove zip code and extra info # Clean up city name - remove zip code and extra info
clean_city = city clean_city = city
if city: if city:
import re import re
clean_city = re.sub(r'[,\s]*\d{5}.*$', '', city).strip()
clean_city = clean_city.rstrip(',').strip() clean_city = re.sub(r"[,\s]*\d{5}.*$", "", city).strip()
clean_city = clean_city.rstrip(",").strip()
logger.info(f"Cleaned city name: '{city}' -> '{clean_city}'") logger.info(f"Cleaned city name: '{city}' -> '{clean_city}'")
@ -856,7 +966,7 @@ def scrape_federal_clubs(country=None, city='', latitude=None, longitude=None,
page_obj.goto(results_url) page_obj.goto(results_url)
# Wait for the page to load # Wait for the page to load
page_obj.wait_for_timeout(3000) page_obj.wait_for_timeout(7000)
# Check if we're in queue # Check if we're in queue
if "queue-it.net" in page_obj.url.lower(): if "queue-it.net" in page_obj.url.lower():
@ -865,7 +975,7 @@ def scrape_federal_clubs(country=None, city='', latitude=None, longitude=None,
return { return {
"typeRecherche": "clubs", "typeRecherche": "clubs",
"nombreResultat": 0, "nombreResultat": 0,
"club_markers": [] "club_markers": [],
} }
# Use JavaScript to extract the data directly from the page # Use JavaScript to extract the data directly from the page
@ -904,45 +1014,45 @@ def scrape_federal_clubs(country=None, city='', latitude=None, longitude=None,
browser.close() browser.close()
if result.get('success'): if result.get("success"):
type_recherche = result.get('typeRecherche', 'club') type_recherche = result.get("typeRecherche", "club")
total = result.get('total', 0) total = result.get("total", 0)
resultat = result.get('resultat', []) resultat = result.get("resultat", [])
logger.info(f"Successfully extracted {total} clubs") logger.info(f"Successfully extracted {total} clubs")
# Convert resultat to club_markers format # Convert resultat to club_markers format
club_markers = [] club_markers = []
for club in resultat: for club in resultat:
club_markers.append({ club_markers.append(
"nom": club.get('nom', ''), {
"clubId": club.get('clubId', ''), "nom": club.get("nom", ""),
"ville": club.get('ville', ''), "clubId": club.get("clubId", ""),
"distance": club.get('distance', ''), "ville": club.get("ville", ""),
"terrainPratiqueLibelle": club.get('terrainPratiqueLibelle', ''), "distance": club.get("distance", ""),
"pratiques": club.get('pratiques', []), "terrainPratiqueLibelle": club.get(
"lat": club.get('lat', 0.0), "terrainPratiqueLibelle", ""
"lng": club.get('lng', 0.0) ),
}) "pratiques": club.get("pratiques", []),
"lat": club.get("lat", 0.0),
"lng": club.get("lng", 0.0),
}
)
return { return {
"typeRecherche": type_recherche, "typeRecherche": type_recherche,
"nombreResultat": total, "nombreResultat": total,
"club_markers": club_markers "club_markers": club_markers,
} }
else: else:
logger.error(f"Failed to extract data: {result.get('error')}") logger.error(f"Failed to extract data: {result.get('error')}")
return { return {
"typeRecherche": "clubs", "typeRecherche": "clubs",
"nombreResultat": 0, "nombreResultat": 0,
"club_markers": [] "club_markers": [],
} }
except Exception as e: except Exception as e:
logger.error(f"Error in federal clubs scraping: {e}") logger.error(f"Error in federal clubs scraping: {e}")
logger.error(f"Traceback: {traceback.format_exc()}") logger.error(f"Traceback: {traceback.format_exc()}")
return { return {"typeRecherche": "clubs", "nombreResultat": 0, "club_markers": []}
"typeRecherche": "clubs",
"nombreResultat": 0,
"club_markers": []
}

Loading…
Cancel
Save