|
|
|
|
@@ -9,19 +9,23 @@ from concurrent.futures import ThreadPoolExecutor, as_completed |
|
|
|
|
|
|
|
|
|
logger = logging.getLogger(__name__) |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def check_version_smaller_than_1_1_12(version_str): |
|
|
|
|
# Remove the parentheses part if it exists, example of version: 1.1.12 (2) |
|
|
|
|
version_str = version_str.split()[0] |
|
|
|
|
if version_str: |
|
|
|
|
# Split version into components |
|
|
|
|
version_parts = [int(x) for x in version_str.split('.')] |
|
|
|
|
version_parts = [int(x) for x in version_str.split(".")] |
|
|
|
|
target_parts = [1, 1, 12] |
|
|
|
|
# Compare version components |
|
|
|
|
return version_parts < target_parts |
|
|
|
|
else: |
|
|
|
|
return False |
|
|
|
|
|
|
|
|
|
def scrape_fft_club_tournaments(club_code, club_name, start_date=None, end_date=None, page=0): |
|
|
|
|
|
|
|
|
|
def scrape_fft_club_tournaments( |
|
|
|
|
club_code, club_name, start_date=None, end_date=None, page=0 |
|
|
|
|
): |
|
|
|
|
""" |
|
|
|
|
Scrapes FFT tournaments using Playwright with detailed debugging |
|
|
|
|
""" |
|
|
|
|
@@ -31,16 +35,18 @@ def scrape_fft_club_tournaments(club_code, club_name, start_date=None, end_date= |
|
|
|
|
browser = p.chromium.launch(headless=True) |
|
|
|
|
page_obj = browser.new_page() |
|
|
|
|
|
|
|
|
|
page_obj.set_extra_http_headers({ |
|
|
|
|
"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/18.5 Safari/605.1.15" |
|
|
|
|
}) |
|
|
|
|
page_obj.set_extra_http_headers( |
|
|
|
|
{ |
|
|
|
|
"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/18.5 Safari/605.1.15" |
|
|
|
|
} |
|
|
|
|
) |
|
|
|
|
|
|
|
|
|
# Navigate to FFT |
|
|
|
|
target_url = "https://tenup.fft.fr/recherche/tournois" |
|
|
|
|
logger.info(f"Navigating to: {target_url}") |
|
|
|
|
|
|
|
|
|
page_obj.goto(target_url) |
|
|
|
|
# page_obj.wait_for_timeout(7000) |
|
|
|
|
page_obj.wait_for_timeout(7000) |
|
|
|
|
|
|
|
|
|
current_url = page_obj.url |
|
|
|
|
logger.info(f"Current URL: {current_url}") |
|
|
|
|
@@ -57,7 +63,7 @@ def scrape_fft_club_tournaments(club_code, club_name, start_date=None, end_date= |
|
|
|
|
browser.close() |
|
|
|
|
return None |
|
|
|
|
|
|
|
|
|
form_build_id = form_input.get_attribute('value') |
|
|
|
|
form_build_id = form_input.get_attribute("value") |
|
|
|
|
logger.info(f"Extracted form_build_id: {form_build_id}") |
|
|
|
|
|
|
|
|
|
# Build parameters |
|
|
|
|
@@ -121,8 +127,8 @@ def scrape_fft_club_tournaments(club_code, club_name, start_date=None, end_date= |
|
|
|
|
logger.info(f"AJAX Response Status: {result.get('status')}") |
|
|
|
|
logger.info(f"AJAX Response Success: {result.get('success')}") |
|
|
|
|
|
|
|
|
|
if result.get('success'): |
|
|
|
|
response_text = result.get('responseText', '') |
|
|
|
|
if result.get("success"): |
|
|
|
|
response_text = result.get("responseText", "") |
|
|
|
|
logger.info(f"Raw Response Length: {len(response_text)}") |
|
|
|
|
# logger.info(f"Raw Response (first 500 chars): {response_text[:500]}") |
|
|
|
|
|
|
|
|
|
@@ -151,7 +157,10 @@ def scrape_fft_club_tournaments(club_code, club_name, start_date=None, end_date= |
|
|
|
|
logger.error(f"Traceback: {traceback.format_exc()}") |
|
|
|
|
return None |
|
|
|
|
|
|
|
|
|
def scrape_fft_club_tournaments_all_pages(club_code, club_name, start_date=None, end_date=None): |
|
|
|
|
|
|
|
|
|
def scrape_fft_club_tournaments_all_pages( |
|
|
|
|
club_code, club_name, start_date=None, end_date=None |
|
|
|
|
): |
|
|
|
|
""" |
|
|
|
|
Scrapes all pages of FFT tournaments for a specific club |
|
|
|
|
""" |
|
|
|
|
@@ -168,7 +177,7 @@ def scrape_fft_club_tournaments_all_pages(club_code, club_name, start_date=None, |
|
|
|
|
club_name=club_name, |
|
|
|
|
start_date=start_date, |
|
|
|
|
end_date=end_date, |
|
|
|
|
page=page |
|
|
|
|
page=page, |
|
|
|
|
) |
|
|
|
|
|
|
|
|
|
# Debug: Log what we got |
|
|
|
|
@@ -178,7 +187,7 @@ def scrape_fft_club_tournaments_all_pages(club_code, club_name, start_date=None, |
|
|
|
|
logger.warning(f"No result for page {page}") |
|
|
|
|
break |
|
|
|
|
|
|
|
|
|
tournaments = result.get('tournaments', []) |
|
|
|
|
tournaments = result.get("tournaments", []) |
|
|
|
|
logger.info(f"Page {page} returned {len(tournaments)} tournaments") |
|
|
|
|
|
|
|
|
|
if not tournaments: |
|
|
|
|
@@ -188,8 +197,10 @@ def scrape_fft_club_tournaments_all_pages(club_code, club_name, start_date=None, |
|
|
|
|
all_tournaments.extend(tournaments) |
|
|
|
|
|
|
|
|
|
# Check if we have all results |
|
|
|
|
total_results = result.get('total_results', 0) |
|
|
|
|
logger.info(f"Total so far: {len(all_tournaments)}, Target: {total_results}") |
|
|
|
|
total_results = result.get("total_results", 0) |
|
|
|
|
logger.info( |
|
|
|
|
f"Total so far: {len(all_tournaments)}, Target: {total_results}" |
|
|
|
|
) |
|
|
|
|
|
|
|
|
|
if len(all_tournaments) >= total_results: |
|
|
|
|
logger.info("Got all tournaments, stopping") |
|
|
|
|
@@ -203,15 +214,18 @@ def scrape_fft_club_tournaments_all_pages(club_code, club_name, start_date=None, |
|
|
|
|
logger.error(f"Error on page {page}: {e}") |
|
|
|
|
break |
|
|
|
|
|
|
|
|
|
logger.info(f"Completed scraping: {len(all_tournaments)} tournaments across {page + 1} pages") |
|
|
|
|
logger.info( |
|
|
|
|
f"Completed scraping: {len(all_tournaments)} tournaments across {page + 1} pages" |
|
|
|
|
) |
|
|
|
|
|
|
|
|
|
return { |
|
|
|
|
'tournaments': all_tournaments, |
|
|
|
|
'total_results': len(all_tournaments), |
|
|
|
|
'current_count': len(all_tournaments), |
|
|
|
|
'pages_scraped': page + 1 |
|
|
|
|
"tournaments": all_tournaments, |
|
|
|
|
"total_results": len(all_tournaments), |
|
|
|
|
"current_count": len(all_tournaments), |
|
|
|
|
"pages_scraped": page + 1, |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _parse_ajax_response(commands): |
|
|
|
|
""" |
|
|
|
|
Parse the AJAX response commands to extract tournament data |
|
|
|
|
@@ -222,114 +236,133 @@ def _parse_ajax_response(commands): |
|
|
|
|
try: |
|
|
|
|
# Check for alert commands (maintenance mode) |
|
|
|
|
for command in commands: |
|
|
|
|
if command.get('command') == 'alert': |
|
|
|
|
if command.get("command") == "alert": |
|
|
|
|
logger.warning("Maintenance mode detected") |
|
|
|
|
return None |
|
|
|
|
|
|
|
|
|
# Find the command with results |
|
|
|
|
result_command = None |
|
|
|
|
for command in commands: |
|
|
|
|
if command.get('command') == 'recherche_tournois_update': |
|
|
|
|
if command.get("command") == "recherche_tournois_update": |
|
|
|
|
result_command = command |
|
|
|
|
logger.info("Found recherche_tournois_update command!") |
|
|
|
|
break |
|
|
|
|
|
|
|
|
|
if result_command and result_command.get('results'): |
|
|
|
|
results = result_command['results'] |
|
|
|
|
items = results.get('items', []) |
|
|
|
|
total_results = results.get('nb_results', 0) |
|
|
|
|
if result_command and result_command.get("results"): |
|
|
|
|
results = result_command["results"] |
|
|
|
|
items = results.get("items", []) |
|
|
|
|
total_results = results.get("nb_results", 0) |
|
|
|
|
|
|
|
|
|
logger.info(f"Processing {len(items)} tournaments from results") |
|
|
|
|
|
|
|
|
|
for item in items: |
|
|
|
|
# Parse dates - they're already in the correct format |
|
|
|
|
date_debut = item.get('dateDebut') |
|
|
|
|
date_fin = item.get('dateFin') |
|
|
|
|
date_validation = item.get('dateValidation') |
|
|
|
|
date_debut = item.get("dateDebut") |
|
|
|
|
date_fin = item.get("dateFin") |
|
|
|
|
date_validation = item.get("dateValidation") |
|
|
|
|
|
|
|
|
|
# Build the tournament object to match Swift FederalTournament structure |
|
|
|
|
tournament = { |
|
|
|
|
"id": str(item.get('id', '')), |
|
|
|
|
"millesime": item.get('millesime'), |
|
|
|
|
"libelle": item.get('libelle'), |
|
|
|
|
"tmc": item.get('tmc'), |
|
|
|
|
"tarifAdulteChampionnat": item.get('tarifAdulteChampionnat'), |
|
|
|
|
"type": item.get('type'), |
|
|
|
|
"ageReel": item.get('ageReel'), |
|
|
|
|
"naturesTerrains": item.get('naturesTerrains', []), |
|
|
|
|
"idsArbitres": item.get('idsArbitres', []), |
|
|
|
|
"tarifJeuneChampionnat": item.get('tarifJeuneChampionnat'), |
|
|
|
|
"international": item.get('international'), |
|
|
|
|
"inscriptionEnLigne": item.get('inscriptionEnLigne'), |
|
|
|
|
"categorieTournoi": item.get('categorieTournoi'), |
|
|
|
|
"prixLot": item.get('prixLot'), |
|
|
|
|
"paiementEnLigne": item.get('paiementEnLigne'), |
|
|
|
|
"reductionAdherentJeune": item.get('reductionAdherentJeune'), |
|
|
|
|
"reductionAdherentAdulte": item.get('reductionAdherentAdulte'), |
|
|
|
|
"paiementEnLigneObligatoire": item.get('paiementEnLigneObligatoire'), |
|
|
|
|
"villeEngagement": item.get('villeEngagement'), |
|
|
|
|
"senior": item.get('senior'), |
|
|
|
|
"veteran": item.get('veteran'), |
|
|
|
|
"inscriptionEnLigneEnCours": item.get('inscriptionEnLigneEnCours'), |
|
|
|
|
"avecResultatPublie": item.get('avecResultatPublie'), |
|
|
|
|
"code": item.get('code'), |
|
|
|
|
"categorieAge": item.get('categorieAge'), |
|
|
|
|
"codeComite": item.get('codeComite'), |
|
|
|
|
"installations": item.get('installations', []), |
|
|
|
|
"reductionEpreuveSupplementaireJeune": item.get('reductionEpreuveSupplementaireJeune'), |
|
|
|
|
"reductionEpreuveSupplementaireAdulte": item.get('reductionEpreuveSupplementaireAdulte'), |
|
|
|
|
"nomComite": item.get('nomComite'), |
|
|
|
|
"naturesEpreuves": item.get('naturesEpreuves'), |
|
|
|
|
"jeune": item.get('jeune'), |
|
|
|
|
"courrielEngagement": item.get('courrielEngagement'), |
|
|
|
|
"nomClub": item.get('nomClub'), |
|
|
|
|
"installation": item.get('installation'), |
|
|
|
|
"categorieAgeMax": item.get('categorieAgeMax'), |
|
|
|
|
"tournoiInterne": item.get('tournoiInterne'), |
|
|
|
|
"nomLigue": item.get('nomLigue'), |
|
|
|
|
"nomEngagement": item.get('nomEngagement'), |
|
|
|
|
"codeLigue": item.get('codeLigue'), |
|
|
|
|
"modeleDeBalle": item.get('modeleDeBalle'), |
|
|
|
|
"jugeArbitre": item.get('jugeArbitre'), |
|
|
|
|
"adresse2Engagement": item.get('adresse2Engagement'), |
|
|
|
|
"epreuves": item.get('epreuves'), |
|
|
|
|
"id": str(item.get("id", "")), |
|
|
|
|
"millesime": item.get("millesime"), |
|
|
|
|
"libelle": item.get("libelle"), |
|
|
|
|
"tmc": item.get("tmc"), |
|
|
|
|
"tarifAdulteChampionnat": item.get("tarifAdulteChampionnat"), |
|
|
|
|
"type": item.get("type"), |
|
|
|
|
"ageReel": item.get("ageReel"), |
|
|
|
|
"naturesTerrains": item.get("naturesTerrains", []), |
|
|
|
|
"idsArbitres": item.get("idsArbitres", []), |
|
|
|
|
"tarifJeuneChampionnat": item.get("tarifJeuneChampionnat"), |
|
|
|
|
"international": item.get("international"), |
|
|
|
|
"inscriptionEnLigne": item.get("inscriptionEnLigne"), |
|
|
|
|
"categorieTournoi": item.get("categorieTournoi"), |
|
|
|
|
"prixLot": item.get("prixLot"), |
|
|
|
|
"paiementEnLigne": item.get("paiementEnLigne"), |
|
|
|
|
"reductionAdherentJeune": item.get("reductionAdherentJeune"), |
|
|
|
|
"reductionAdherentAdulte": item.get("reductionAdherentAdulte"), |
|
|
|
|
"paiementEnLigneObligatoire": item.get( |
|
|
|
|
"paiementEnLigneObligatoire" |
|
|
|
|
), |
|
|
|
|
"villeEngagement": item.get("villeEngagement"), |
|
|
|
|
"senior": item.get("senior"), |
|
|
|
|
"veteran": item.get("veteran"), |
|
|
|
|
"inscriptionEnLigneEnCours": item.get("inscriptionEnLigneEnCours"), |
|
|
|
|
"avecResultatPublie": item.get("avecResultatPublie"), |
|
|
|
|
"code": item.get("code"), |
|
|
|
|
"categorieAge": item.get("categorieAge"), |
|
|
|
|
"codeComite": item.get("codeComite"), |
|
|
|
|
"installations": item.get("installations", []), |
|
|
|
|
"reductionEpreuveSupplementaireJeune": item.get( |
|
|
|
|
"reductionEpreuveSupplementaireJeune" |
|
|
|
|
), |
|
|
|
|
"reductionEpreuveSupplementaireAdulte": item.get( |
|
|
|
|
"reductionEpreuveSupplementaireAdulte" |
|
|
|
|
), |
|
|
|
|
"nomComite": item.get("nomComite"), |
|
|
|
|
"naturesEpreuves": item.get("naturesEpreuves"), |
|
|
|
|
"jeune": item.get("jeune"), |
|
|
|
|
"courrielEngagement": item.get("courrielEngagement"), |
|
|
|
|
"nomClub": item.get("nomClub"), |
|
|
|
|
"installation": item.get("installation"), |
|
|
|
|
"categorieAgeMax": item.get("categorieAgeMax"), |
|
|
|
|
"tournoiInterne": item.get("tournoiInterne"), |
|
|
|
|
"nomLigue": item.get("nomLigue"), |
|
|
|
|
"nomEngagement": item.get("nomEngagement"), |
|
|
|
|
"codeLigue": item.get("codeLigue"), |
|
|
|
|
"modeleDeBalle": item.get("modeleDeBalle"), |
|
|
|
|
"jugeArbitre": item.get("jugeArbitre"), |
|
|
|
|
"adresse2Engagement": item.get("adresse2Engagement"), |
|
|
|
|
"epreuves": item.get("epreuves"), |
|
|
|
|
"dateDebut": date_debut, |
|
|
|
|
"serie": item.get('serie'), |
|
|
|
|
"serie": item.get("serie"), |
|
|
|
|
"dateFin": date_fin, |
|
|
|
|
"dateValidation": date_validation, |
|
|
|
|
"codePostalEngagement": item.get('codePostalEngagement'), |
|
|
|
|
"codeClub": item.get('codeClub'), |
|
|
|
|
"prixEspece": item.get('prixEspece'), |
|
|
|
|
"codePostalEngagement": item.get("codePostalEngagement"), |
|
|
|
|
"codeClub": item.get("codeClub"), |
|
|
|
|
"prixEspece": item.get("prixEspece"), |
|
|
|
|
"japPhoneNumber": None, # Will be populated by separate umpire call |
|
|
|
|
|
|
|
|
|
# Additional fields from the response |
|
|
|
|
"adresse1Engagement": item.get('adresse1Engagement'), |
|
|
|
|
"originalId": item.get('originalId'), |
|
|
|
|
"familleTournoi": item.get('familleTournoi', []), |
|
|
|
|
"isTournoi": item.get('isTournoi'), |
|
|
|
|
"natureWithCatAge": item.get('natureWithCatAge') |
|
|
|
|
"adresse1Engagement": item.get("adresse1Engagement"), |
|
|
|
|
"originalId": item.get("originalId"), |
|
|
|
|
"familleTournoi": item.get("familleTournoi", []), |
|
|
|
|
"isTournoi": item.get("isTournoi"), |
|
|
|
|
"natureWithCatAge": item.get("natureWithCatAge"), |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
tournaments.append(tournament) |
|
|
|
|
|
|
|
|
|
logger.info(f"Successfully parsed {len(tournaments)} tournaments from response") |
|
|
|
|
logger.info( |
|
|
|
|
f"Successfully parsed {len(tournaments)} tournaments from response" |
|
|
|
|
) |
|
|
|
|
return { |
|
|
|
|
'tournaments': tournaments, |
|
|
|
|
'total_results': total_results, |
|
|
|
|
'current_count': len(tournaments) |
|
|
|
|
"tournaments": tournaments, |
|
|
|
|
"total_results": total_results, |
|
|
|
|
"current_count": len(tournaments), |
|
|
|
|
} |
|
|
|
|
else: |
|
|
|
|
logger.error("No recherche_tournois_update command found in AJAX response") |
|
|
|
|
return {'tournaments': [], 'total_results': 0, 'current_count': 0} |
|
|
|
|
return {"tournaments": [], "total_results": 0, "current_count": 0} |
|
|
|
|
|
|
|
|
|
except Exception as e: |
|
|
|
|
logger.error(f"Error parsing AJAX response: {e}") |
|
|
|
|
return None |
|
|
|
|
|
|
|
|
|
def scrape_fft_all_tournaments(sorting_option=None, page=0, start_date=None, end_date=None, |
|
|
|
|
city='', distance=15, categories=None, levels=None, |
|
|
|
|
lat=None, lng=None, ages=None, tournament_types=None, |
|
|
|
|
national_cup=False): |
|
|
|
|
|
|
|
|
|
def scrape_fft_all_tournaments( |
|
|
|
|
sorting_option=None, |
|
|
|
|
page=0, |
|
|
|
|
start_date=None, |
|
|
|
|
end_date=None, |
|
|
|
|
city="", |
|
|
|
|
distance=15, |
|
|
|
|
categories=None, |
|
|
|
|
levels=None, |
|
|
|
|
lat=None, |
|
|
|
|
lng=None, |
|
|
|
|
ages=None, |
|
|
|
|
tournament_types=None, |
|
|
|
|
national_cup=False, |
|
|
|
|
): |
|
|
|
|
""" |
|
|
|
|
Scrapes FFT tournaments using Playwright with detailed debugging |
|
|
|
|
Based exactly on the working scrape_fft_club_tournaments function |
|
|
|
|
@@ -341,16 +374,18 @@ def scrape_fft_all_tournaments(sorting_option=None, page=0, start_date=None, end |
|
|
|
|
browser = p.chromium.launch(headless=True) |
|
|
|
|
page_obj = browser.new_page() |
|
|
|
|
|
|
|
|
|
page_obj.set_extra_http_headers({ |
|
|
|
|
"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/18.5 Safari/605.1.15" |
|
|
|
|
}) |
|
|
|
|
page_obj.set_extra_http_headers( |
|
|
|
|
{ |
|
|
|
|
"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/18.5 Safari/605.1.15" |
|
|
|
|
} |
|
|
|
|
) |
|
|
|
|
|
|
|
|
|
# Navigate to FFT |
|
|
|
|
target_url = "https://tenup.fft.fr/recherche/tournois" |
|
|
|
|
logger.info(f"Navigating to: {target_url}") |
|
|
|
|
|
|
|
|
|
page_obj.goto(target_url) |
|
|
|
|
# page_obj.wait_for_timeout(7000) |
|
|
|
|
page_obj.wait_for_timeout(7000) |
|
|
|
|
|
|
|
|
|
current_url = page_obj.url |
|
|
|
|
logger.info(f"Current URL: {current_url}") |
|
|
|
|
@@ -367,7 +402,7 @@ def scrape_fft_all_tournaments(sorting_option=None, page=0, start_date=None, end |
|
|
|
|
browser.close() |
|
|
|
|
return None |
|
|
|
|
|
|
|
|
|
form_build_id = form_input.get_attribute('value') |
|
|
|
|
form_build_id = form_input.get_attribute("value") |
|
|
|
|
logger.info(f"Extracted form_build_id: {form_build_id}") |
|
|
|
|
|
|
|
|
|
# Build parameters - EXACT same pattern as club function |
|
|
|
|
@@ -486,8 +521,8 @@ def scrape_fft_all_tournaments(sorting_option=None, page=0, start_date=None, end |
|
|
|
|
logger.info(f"AJAX Response Status: {result.get('status')}") |
|
|
|
|
logger.info(f"AJAX Response Success: {result.get('success')}") |
|
|
|
|
|
|
|
|
|
if result.get('success'): |
|
|
|
|
response_text = result.get('responseText', '') |
|
|
|
|
if result.get("success"): |
|
|
|
|
response_text = result.get("responseText", "") |
|
|
|
|
logger.info(f"Raw Response Length: {len(response_text)}") |
|
|
|
|
|
|
|
|
|
try: |
|
|
|
|
@@ -525,13 +560,13 @@ def get_umpire_data(tournament_id): |
|
|
|
|
browser = p.chromium.launch( |
|
|
|
|
headless=True, |
|
|
|
|
args=[ |
|
|
|
|
'--no-sandbox', |
|
|
|
|
'--disable-dev-shm-usage', |
|
|
|
|
'--disable-images', # Don't load images |
|
|
|
|
'--disable-javascript', # Disable JS for faster loading |
|
|
|
|
'--disable-plugins', |
|
|
|
|
'--disable-extensions' |
|
|
|
|
] |
|
|
|
|
"--no-sandbox", |
|
|
|
|
"--disable-dev-shm-usage", |
|
|
|
|
"--disable-images", # Don't load images |
|
|
|
|
"--disable-javascript", # Disable JS for faster loading |
|
|
|
|
"--disable-plugins", |
|
|
|
|
"--disable-extensions", |
|
|
|
|
], |
|
|
|
|
) |
|
|
|
|
page = browser.new_page() |
|
|
|
|
|
|
|
|
|
@@ -553,7 +588,9 @@ def get_umpire_data(tournament_id): |
|
|
|
|
html_content = page.content() |
|
|
|
|
|
|
|
|
|
# Extract name |
|
|
|
|
name_pattern = r'tournoi-detail-page-inscription-responsable-title">\s*([^<]+)\s*<' |
|
|
|
|
name_pattern = ( |
|
|
|
|
r'tournoi-detail-page-inscription-responsable-title">\s*([^<]+)\s*<' |
|
|
|
|
) |
|
|
|
|
name_match = re.search(name_pattern, html_content) |
|
|
|
|
name = name_match.group(1).strip() if name_match else None |
|
|
|
|
|
|
|
|
|
@@ -569,7 +606,9 @@ def get_umpire_data(tournament_id): |
|
|
|
|
|
|
|
|
|
browser.close() |
|
|
|
|
|
|
|
|
|
logger.info(f"Extracted umpire data: name={name}, email={email}, phone={phone}") |
|
|
|
|
logger.info( |
|
|
|
|
f"Extracted umpire data: name={name}, email={email}, phone={phone}" |
|
|
|
|
) |
|
|
|
|
return name, email, phone |
|
|
|
|
|
|
|
|
|
except Exception as page_error: |
|
|
|
|
@@ -581,6 +620,7 @@ def get_umpire_data(tournament_id): |
|
|
|
|
logger.error(f"Error in umpire data extraction: {e}") |
|
|
|
|
return None, None, None |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _get_umpire_data_requests_fallback(tournament_id): |
|
|
|
|
""" |
|
|
|
|
Fallback method using requests (may hit Queue-It) |
|
|
|
|
@@ -591,7 +631,7 @@ def _get_umpire_data_requests_fallback(tournament_id): |
|
|
|
|
url = f"https://tenup.fft.fr/tournoi/{tournament_id}" |
|
|
|
|
|
|
|
|
|
headers = { |
|
|
|
|
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/18.5 Safari/605.1.15' |
|
|
|
|
"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/18.5 Safari/605.1.15" |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
response = requests.get(url, headers=headers, timeout=30) |
|
|
|
|
@@ -603,7 +643,9 @@ def _get_umpire_data_requests_fallback(tournament_id): |
|
|
|
|
html_content = response.text |
|
|
|
|
|
|
|
|
|
# Extract using regex (original method) |
|
|
|
|
name_pattern = r'tournoi-detail-page-inscription-responsable-title">\s*([^<]+)\s*<' |
|
|
|
|
name_pattern = ( |
|
|
|
|
r'tournoi-detail-page-inscription-responsable-title">\s*([^<]+)\s*<' |
|
|
|
|
) |
|
|
|
|
name_match = re.search(name_pattern, html_content) |
|
|
|
|
name = name_match.group(1).strip() if name_match else None |
|
|
|
|
|
|
|
|
|
@@ -615,15 +657,31 @@ def _get_umpire_data_requests_fallback(tournament_id): |
|
|
|
|
phone_match = re.search(phone_pattern, html_content) |
|
|
|
|
phone = phone_match.group(1).strip() if phone_match else None |
|
|
|
|
|
|
|
|
|
logger.info(f"Extracted umpire data (requests): name={name}, email={email}, phone={phone}") |
|
|
|
|
logger.info( |
|
|
|
|
f"Extracted umpire data (requests): name={name}, email={email}, phone={phone}" |
|
|
|
|
) |
|
|
|
|
return name, email, phone |
|
|
|
|
|
|
|
|
|
except Exception as e: |
|
|
|
|
logger.error(f"Error getting umpire data with requests: {e}") |
|
|
|
|
return None, None, None |
|
|
|
|
|
|
|
|
|
def _scrape_single_page(sorting_option, page, start_date, end_date, city, distance, |
|
|
|
|
categories, levels, lat, lng, ages, tournament_types, national_cup): |
|
|
|
|
|
|
|
|
|
def _scrape_single_page( |
|
|
|
|
sorting_option, |
|
|
|
|
page, |
|
|
|
|
start_date, |
|
|
|
|
end_date, |
|
|
|
|
city, |
|
|
|
|
distance, |
|
|
|
|
categories, |
|
|
|
|
levels, |
|
|
|
|
lat, |
|
|
|
|
lng, |
|
|
|
|
ages, |
|
|
|
|
tournament_types, |
|
|
|
|
national_cup, |
|
|
|
|
): |
|
|
|
|
""" |
|
|
|
|
Helper function to scrape a single page of tournaments |
|
|
|
|
""" |
|
|
|
|
@@ -640,14 +698,25 @@ def _scrape_single_page(sorting_option, page, start_date, end_date, city, distan |
|
|
|
|
lng=lng, |
|
|
|
|
ages=ages, |
|
|
|
|
tournament_types=tournament_types, |
|
|
|
|
national_cup=national_cup |
|
|
|
|
national_cup=national_cup, |
|
|
|
|
) |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def scrape_fft_all_tournaments_concurrent(sorting_option=None, start_date=None, end_date=None, |
|
|
|
|
city='', distance=15, categories=None, levels=None, |
|
|
|
|
lat=None, lng=None, ages=None, tournament_types=None, |
|
|
|
|
national_cup=False, max_workers=5): |
|
|
|
|
def scrape_fft_all_tournaments_concurrent( |
|
|
|
|
sorting_option=None, |
|
|
|
|
start_date=None, |
|
|
|
|
end_date=None, |
|
|
|
|
city="", |
|
|
|
|
distance=15, |
|
|
|
|
categories=None, |
|
|
|
|
levels=None, |
|
|
|
|
lat=None, |
|
|
|
|
lng=None, |
|
|
|
|
ages=None, |
|
|
|
|
tournament_types=None, |
|
|
|
|
national_cup=False, |
|
|
|
|
max_workers=5, |
|
|
|
|
): |
|
|
|
|
""" |
|
|
|
|
Scrapes all remaining pages of FFT tournaments concurrently (pages 1 to end) |
|
|
|
|
This assumes page 0 was already fetched by the client |
|
|
|
|
@@ -668,21 +737,26 @@ def scrape_fft_all_tournaments_concurrent(sorting_option=None, start_date=None, |
|
|
|
|
lng=lng, |
|
|
|
|
ages=ages, |
|
|
|
|
tournament_types=tournament_types, |
|
|
|
|
national_cup=national_cup |
|
|
|
|
national_cup=national_cup, |
|
|
|
|
) |
|
|
|
|
|
|
|
|
|
if not first_page_result: |
|
|
|
|
logger.error("Failed to get first page results for pagination info") |
|
|
|
|
return None |
|
|
|
|
|
|
|
|
|
total_results = first_page_result.get('total_results', 0) |
|
|
|
|
first_page_tournaments = first_page_result.get('tournaments', []) |
|
|
|
|
total_results = first_page_result.get("total_results", 0) |
|
|
|
|
first_page_tournaments = first_page_result.get("tournaments", []) |
|
|
|
|
results_per_page = len(first_page_tournaments) |
|
|
|
|
|
|
|
|
|
logger.info(f"Total results: {total_results}, Results per page: {results_per_page}") |
|
|
|
|
|
|
|
|
|
if total_results == 0: |
|
|
|
|
return {'tournaments': [], 'total_results': 0, 'current_count': 0, 'pages_scraped': 0} |
|
|
|
|
return { |
|
|
|
|
"tournaments": [], |
|
|
|
|
"total_results": 0, |
|
|
|
|
"current_count": 0, |
|
|
|
|
"pages_scraped": 0, |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
# Calculate number of pages needed |
|
|
|
|
if results_per_page > 0: |
|
|
|
|
@@ -694,7 +768,12 @@ def scrape_fft_all_tournaments_concurrent(sorting_option=None, start_date=None, |
|
|
|
|
|
|
|
|
|
# If only one page total, return empty since page 0 was already handled |
|
|
|
|
if total_pages <= 1: |
|
|
|
|
return {'tournaments': [], 'total_results': total_results, 'current_count': 0, 'pages_scraped': 0} |
|
|
|
|
return { |
|
|
|
|
"tournaments": [], |
|
|
|
|
"total_results": total_results, |
|
|
|
|
"current_count": 0, |
|
|
|
|
"pages_scraped": 0, |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
# Scrape all remaining pages concurrently (pages 1 to total_pages-1) |
|
|
|
|
all_tournaments = [] |
|
|
|
|
@@ -705,8 +784,19 @@ def scrape_fft_all_tournaments_concurrent(sorting_option=None, start_date=None, |
|
|
|
|
for page in range(1, total_pages): |
|
|
|
|
future = executor.submit( |
|
|
|
|
_scrape_single_page, |
|
|
|
|
sorting_option, page, start_date, end_date, city, distance, |
|
|
|
|
categories, levels, lat, lng, ages, tournament_types, national_cup |
|
|
|
|
sorting_option, |
|
|
|
|
page, |
|
|
|
|
start_date, |
|
|
|
|
end_date, |
|
|
|
|
city, |
|
|
|
|
distance, |
|
|
|
|
categories, |
|
|
|
|
levels, |
|
|
|
|
lat, |
|
|
|
|
lng, |
|
|
|
|
ages, |
|
|
|
|
tournament_types, |
|
|
|
|
national_cup, |
|
|
|
|
) |
|
|
|
|
futures.append((page, future)) |
|
|
|
|
|
|
|
|
|
@@ -714,24 +804,30 @@ def scrape_fft_all_tournaments_concurrent(sorting_option=None, start_date=None, |
|
|
|
|
for page, future in futures: |
|
|
|
|
try: |
|
|
|
|
result = future.result(timeout=60) # 60 second timeout per page |
|
|
|
|
if result and result.get('tournaments'): |
|
|
|
|
tournaments = result.get('tournaments', []) |
|
|
|
|
if result and result.get("tournaments"): |
|
|
|
|
tournaments = result.get("tournaments", []) |
|
|
|
|
all_tournaments.extend(tournaments) |
|
|
|
|
logger.info(f"Page {page} completed: {len(tournaments)} tournaments") |
|
|
|
|
logger.info( |
|
|
|
|
f"Page {page} completed: {len(tournaments)} tournaments" |
|
|
|
|
) |
|
|
|
|
else: |
|
|
|
|
logger.warning(f"Page {page} returned no results") |
|
|
|
|
except Exception as e: |
|
|
|
|
logger.error(f"Error processing page {page}: {e}") |
|
|
|
|
|
|
|
|
|
logger.info(f"Concurrent scraping completed: {len(all_tournaments)} tournaments from {total_pages-1} remaining pages") |
|
|
|
|
logger.info( |
|
|
|
|
f"Concurrent scraping completed: {len(all_tournaments)} tournaments from {total_pages - 1} remaining pages" |
|
|
|
|
) |
|
|
|
|
|
|
|
|
|
return { |
|
|
|
|
'tournaments': all_tournaments, |
|
|
|
|
'total_results': total_results, |
|
|
|
|
'current_count': len(all_tournaments), |
|
|
|
|
'pages_scraped': total_pages - 1 # Excluding page 0 which was handled separately |
|
|
|
|
"tournaments": all_tournaments, |
|
|
|
|
"total_results": total_results, |
|
|
|
|
"current_count": len(all_tournaments), |
|
|
|
|
"pages_scraped": total_pages |
|
|
|
|
- 1, # Excluding page 0 which was handled separately |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _parse_clubs_ajax_response(json_data): |
|
|
|
|
""" |
|
|
|
|
Parse the clubs AJAX response to match Swift FederalClubResponse structure |
|
|
|
|
@@ -746,19 +842,24 @@ def _parse_clubs_ajax_response(json_data): |
|
|
|
|
# Try to extract clubs data from different possible response structures |
|
|
|
|
if isinstance(json_data, dict): |
|
|
|
|
# Pattern 1: Direct club_markers array |
|
|
|
|
if 'club_markers' in json_data: |
|
|
|
|
clubs_data = json_data['club_markers'] |
|
|
|
|
total_results = json_data.get('nombreResultat', len(clubs_data)) |
|
|
|
|
if "club_markers" in json_data: |
|
|
|
|
clubs_data = json_data["club_markers"] |
|
|
|
|
total_results = json_data.get("nombreResultat", len(clubs_data)) |
|
|
|
|
|
|
|
|
|
# Pattern 2: Results wrapper |
|
|
|
|
elif 'results' in json_data: |
|
|
|
|
results = json_data['results'] |
|
|
|
|
clubs_data = results.get('clubs', results.get('items', results.get('club_markers', []))) |
|
|
|
|
total_results = results.get('nombreResultat', results.get('total', results.get('nb_results', len(clubs_data)))) |
|
|
|
|
elif "results" in json_data: |
|
|
|
|
results = json_data["results"] |
|
|
|
|
clubs_data = results.get( |
|
|
|
|
"clubs", results.get("items", results.get("club_markers", [])) |
|
|
|
|
) |
|
|
|
|
total_results = results.get( |
|
|
|
|
"nombreResultat", |
|
|
|
|
results.get("total", results.get("nb_results", len(clubs_data))), |
|
|
|
|
) |
|
|
|
|
|
|
|
|
|
# Pattern 3: Direct array in response |
|
|
|
|
elif 'data' in json_data: |
|
|
|
|
clubs_data = json_data['data'] |
|
|
|
|
elif "data" in json_data: |
|
|
|
|
clubs_data = json_data["data"] |
|
|
|
|
total_results = len(clubs_data) |
|
|
|
|
|
|
|
|
|
# Pattern 4: Response is the clubs array directly |
|
|
|
|
@@ -780,49 +881,55 @@ def _parse_clubs_ajax_response(json_data): |
|
|
|
|
if isinstance(item, dict): |
|
|
|
|
# Extract pratiques array |
|
|
|
|
pratiques = [] |
|
|
|
|
if 'pratiques' in item: |
|
|
|
|
pratiques = item['pratiques'] |
|
|
|
|
elif 'practices' in item: |
|
|
|
|
pratiques = item['practices'] |
|
|
|
|
if "pratiques" in item: |
|
|
|
|
pratiques = item["pratiques"] |
|
|
|
|
elif "practices" in item: |
|
|
|
|
pratiques = item["practices"] |
|
|
|
|
else: |
|
|
|
|
# Default to PADEL if not specified |
|
|
|
|
pratiques = ["PADEL"] |
|
|
|
|
|
|
|
|
|
# Ensure pratiques are uppercase strings |
|
|
|
|
pratiques = [p.upper() if isinstance(p, str) else str(p).upper() for p in pratiques] |
|
|
|
|
pratiques = [ |
|
|
|
|
p.upper() if isinstance(p, str) else str(p).upper() |
|
|
|
|
for p in pratiques |
|
|
|
|
] |
|
|
|
|
|
|
|
|
|
club_marker = { |
|
|
|
|
"nom": item.get('nom', item.get('name', '')), |
|
|
|
|
"clubId": str(item.get('clubId', item.get('id', item.get('code', '')))), |
|
|
|
|
"ville": item.get('ville', item.get('city', '')), |
|
|
|
|
"distance": str(item.get('distance', '0')), |
|
|
|
|
"terrainPratiqueLibelle": item.get('terrainPratiqueLibelle', item.get('courtsInfo', '')), |
|
|
|
|
"nom": item.get("nom", item.get("name", "")), |
|
|
|
|
"clubId": str( |
|
|
|
|
item.get("clubId", item.get("id", item.get("code", ""))) |
|
|
|
|
), |
|
|
|
|
"ville": item.get("ville", item.get("city", "")), |
|
|
|
|
"distance": str(item.get("distance", "0")), |
|
|
|
|
"terrainPratiqueLibelle": item.get( |
|
|
|
|
"terrainPratiqueLibelle", item.get("courtsInfo", "") |
|
|
|
|
), |
|
|
|
|
"pratiques": pratiques, |
|
|
|
|
"lat": float(item.get('lat', item.get('latitude', 0.0))), |
|
|
|
|
"lng": float(item.get('lng', item.get('longitude', 0.0))) |
|
|
|
|
"lat": float(item.get("lat", item.get("latitude", 0.0))), |
|
|
|
|
"lng": float(item.get("lng", item.get("longitude", 0.0))), |
|
|
|
|
} |
|
|
|
|
club_markers.append(club_marker) |
|
|
|
|
|
|
|
|
|
logger.info(f"Successfully parsed {len(club_markers)} club markers from response") |
|
|
|
|
logger.info( |
|
|
|
|
f"Successfully parsed {len(club_markers)} club markers from response" |
|
|
|
|
) |
|
|
|
|
|
|
|
|
|
# Return the response in the format expected by Swift FederalClubResponse |
|
|
|
|
return { |
|
|
|
|
"typeRecherche": "clubs", |
|
|
|
|
"nombreResultat": total_results, |
|
|
|
|
"club_markers": club_markers |
|
|
|
|
"club_markers": club_markers, |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
except Exception as e: |
|
|
|
|
logger.error(f"Error parsing clubs AJAX response: {e}") |
|
|
|
|
return { |
|
|
|
|
"typeRecherche": "clubs", |
|
|
|
|
"nombreResultat": 0, |
|
|
|
|
"club_markers": [] |
|
|
|
|
} |
|
|
|
|
return {"typeRecherche": "clubs", "nombreResultat": 0, "club_markers": []} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def scrape_federal_clubs(country=None, city='', latitude=None, longitude=None, |
|
|
|
|
radius=15, max_workers=5): |
|
|
|
|
def scrape_federal_clubs( |
|
|
|
|
country=None, city="", latitude=None, longitude=None, radius=15, max_workers=5 |
|
|
|
|
): |
|
|
|
|
""" |
|
|
|
|
Scrapes FFT federal clubs by extracting data from the HTML response |
|
|
|
|
""" |
|
|
|
|
@ -833,16 +940,19 @@ def scrape_federal_clubs(country=None, city='', latitude=None, longitude=None, |
|
|
|
|
browser = p.chromium.launch(headless=True) |
|
|
|
|
page_obj = browser.new_page() |
|
|
|
|
|
|
|
|
|
page_obj.set_extra_http_headers({ |
|
|
|
|
"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/18.5 Safari/605.1.15" |
|
|
|
|
}) |
|
|
|
|
page_obj.set_extra_http_headers( |
|
|
|
|
{ |
|
|
|
|
"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/18.5 Safari/605.1.15" |
|
|
|
|
} |
|
|
|
|
) |
|
|
|
|
|
|
|
|
|
# Clean up city name - remove zip code and extra info |
|
|
|
|
clean_city = city |
|
|
|
|
if city: |
|
|
|
|
import re |
|
|
|
|
clean_city = re.sub(r'[,\s]*\d{5}.*$', '', city).strip() |
|
|
|
|
clean_city = clean_city.rstrip(',').strip() |
|
|
|
|
|
|
|
|
|
clean_city = re.sub(r"[,\s]*\d{5}.*$", "", city).strip() |
|
|
|
|
clean_city = clean_city.rstrip(",").strip() |
|
|
|
|
|
|
|
|
|
logger.info(f"Cleaned city name: '{city}' -> '{clean_city}'") |
|
|
|
|
|
|
|
|
|
@ -856,7 +966,7 @@ def scrape_federal_clubs(country=None, city='', latitude=None, longitude=None, |
|
|
|
|
page_obj.goto(results_url) |
|
|
|
|
|
|
|
|
|
# Wait for the page to load |
|
|
|
|
page_obj.wait_for_timeout(3000) |
|
|
|
|
page_obj.wait_for_timeout(7000) |
|
|
|
|
|
|
|
|
|
# Check if we're in queue |
|
|
|
|
if "queue-it.net" in page_obj.url.lower(): |
|
|
|
|
@ -865,7 +975,7 @@ def scrape_federal_clubs(country=None, city='', latitude=None, longitude=None, |
|
|
|
|
return { |
|
|
|
|
"typeRecherche": "clubs", |
|
|
|
|
"nombreResultat": 0, |
|
|
|
|
"club_markers": [] |
|
|
|
|
"club_markers": [], |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
# Use JavaScript to extract the data directly from the page |
|
|
|
|
@ -904,45 +1014,45 @@ def scrape_federal_clubs(country=None, city='', latitude=None, longitude=None, |
|
|
|
|
|
|
|
|
|
browser.close() |
|
|
|
|
|
|
|
|
|
if result.get('success'): |
|
|
|
|
type_recherche = result.get('typeRecherche', 'club') |
|
|
|
|
total = result.get('total', 0) |
|
|
|
|
resultat = result.get('resultat', []) |
|
|
|
|
if result.get("success"): |
|
|
|
|
type_recherche = result.get("typeRecherche", "club") |
|
|
|
|
total = result.get("total", 0) |
|
|
|
|
resultat = result.get("resultat", []) |
|
|
|
|
|
|
|
|
|
logger.info(f"Successfully extracted {total} clubs") |
|
|
|
|
|
|
|
|
|
# Convert resultat to club_markers format |
|
|
|
|
club_markers = [] |
|
|
|
|
for club in resultat: |
|
|
|
|
club_markers.append({ |
|
|
|
|
"nom": club.get('nom', ''), |
|
|
|
|
"clubId": club.get('clubId', ''), |
|
|
|
|
"ville": club.get('ville', ''), |
|
|
|
|
"distance": club.get('distance', ''), |
|
|
|
|
"terrainPratiqueLibelle": club.get('terrainPratiqueLibelle', ''), |
|
|
|
|
"pratiques": club.get('pratiques', []), |
|
|
|
|
"lat": club.get('lat', 0.0), |
|
|
|
|
"lng": club.get('lng', 0.0) |
|
|
|
|
}) |
|
|
|
|
club_markers.append( |
|
|
|
|
{ |
|
|
|
|
"nom": club.get("nom", ""), |
|
|
|
|
"clubId": club.get("clubId", ""), |
|
|
|
|
"ville": club.get("ville", ""), |
|
|
|
|
"distance": club.get("distance", ""), |
|
|
|
|
"terrainPratiqueLibelle": club.get( |
|
|
|
|
"terrainPratiqueLibelle", "" |
|
|
|
|
), |
|
|
|
|
"pratiques": club.get("pratiques", []), |
|
|
|
|
"lat": club.get("lat", 0.0), |
|
|
|
|
"lng": club.get("lng", 0.0), |
|
|
|
|
} |
|
|
|
|
) |
|
|
|
|
|
|
|
|
|
return { |
|
|
|
|
"typeRecherche": type_recherche, |
|
|
|
|
"nombreResultat": total, |
|
|
|
|
"club_markers": club_markers |
|
|
|
|
"club_markers": club_markers, |
|
|
|
|
} |
|
|
|
|
else: |
|
|
|
|
logger.error(f"Failed to extract data: {result.get('error')}") |
|
|
|
|
return { |
|
|
|
|
"typeRecherche": "clubs", |
|
|
|
|
"nombreResultat": 0, |
|
|
|
|
"club_markers": [] |
|
|
|
|
"club_markers": [], |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
except Exception as e: |
|
|
|
|
logger.error(f"Error in federal clubs scraping: {e}") |
|
|
|
|
logger.error(f"Traceback: {traceback.format_exc()}") |
|
|
|
|
return { |
|
|
|
|
"typeRecherche": "clubs", |
|
|
|
|
"nombreResultat": 0, |
|
|
|
|
"club_markers": [] |
|
|
|
|
} |
|
|
|
|
return {"typeRecherche": "clubs", "nombreResultat": 0, "club_markers": []} |
|
|
|
|
|