You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
1125 lines
40 KiB
1125 lines
40 KiB
import logging
|
|
import requests
|
|
import re
|
|
from playwright.sync_api import sync_playwright
|
|
from datetime import datetime, timedelta
|
|
import json
|
|
import traceback
|
|
from concurrent.futures import ThreadPoolExecutor, as_completed
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
def check_version_smaller_than_1_1_12(version_str):
    """Return True when ``version_str`` denotes a version older than 1.1.12.

    The version may carry a trailing build marker, e.g. ``"1.1.12 (2)"``;
    only the first whitespace-separated token is compared.

    Args:
        version_str: Dotted numeric version such as ``"1.1.9"``. ``None``,
            empty and whitespace-only values are accepted.

    Returns:
        ``True`` if the dotted components compare lower than ``[1, 1, 12]``
        (list comparison, so ``"1.1"`` counts as older), ``False`` otherwise,
        including when no version is supplied.

    Raises:
        ValueError: If a dotted component is not an integer.
    """
    # str.split() on an empty/blank string yields [], so guard before
    # indexing; the previous code indexed first and crashed with IndexError.
    tokens = version_str.split() if version_str else []
    if not tokens:
        return False

    version_parts = [int(part) for part in tokens[0].split(".")]
    return version_parts < [1, 1, 12]
|
|
|
|
|
|
def scrape_fft_club_tournaments(
    club_code, club_name, start_date=None, end_date=None, page=0
):
    """
    Scrape one result page of padel tournaments for a club via Playwright.

    The function opens the Ten'Up tournament search page, reads the
    ``form_build_id`` hidden input, then POSTs the search form to
    ``/system/ajax`` from inside the page and hands the decoded JSON to
    ``_parse_ajax_response``.

    Args:
        club_code: Club code; spaces are removed before it is sent.
        club_name: Club label sent alongside the code.
        start_date: Optional start date string. When given without
            ``end_date`` it must be ``dd/mm/yy`` and a 90-day window is used;
            an unparsable value is logged and the date filter is dropped.
        end_date: Optional end date string, used only with ``start_date``.
        page: Zero-based result page index.

    Returns:
        Whatever ``_parse_ajax_response`` returns for the response, or
        ``None`` when the site redirects to Queue-It, the form id is missing,
        the AJAX call fails, the body is not JSON, or any exception occurs.
    """
    # Function-scope import so this block does not depend on the file-level
    # import section being edited.
    from urllib.parse import quote_plus

    logger.info(f"Starting Playwright scraping for {club_name}")

    # The request body is passed to the page as an argument instead of being
    # interpolated into the script, so no value can break out of the JS.
    ajax_script = """
    async (body) => {
        try {
            const response = await fetch('https://tenup.fft.fr/system/ajax', {
                method: 'POST',
                headers: {
                    'Accept': 'application/json, text/javascript, */*; q=0.01',
                    'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
                    'X-Requested-With': 'XMLHttpRequest',
                    'Origin': 'https://tenup.fft.fr',
                    'Referer': 'https://tenup.fft.fr/recherche/tournois'
                },
                body: body
            });

            const status = response.status;
            const responseText = await response.text();

            return {
                success: response.ok,
                status: status,
                responseText: responseText
            };
        } catch (error) {
            return {
                success: false,
                error: error.message
            };
        }
    }
    """

    try:
        with sync_playwright() as p:
            browser = p.chromium.launch(headless=True)
            try:
                page_obj = browser.new_page()

                page_obj.set_extra_http_headers(
                    {
                        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/18.5 Safari/605.1.15"
                    }
                )

                # Navigate to FFT
                target_url = "https://tenup.fft.fr/recherche/tournois"
                logger.info(f"Navigating to: {target_url}")
                page_obj.goto(target_url)

                current_url = page_obj.url
                logger.info(f"Current URL: {current_url}")

                if "queue-it.net" in current_url.lower():
                    logger.warning("Still in Queue-It")
                    return None

                # Extract form_build_id
                form_input = page_obj.query_selector('input[name="form_build_id"]')
                if not form_input:
                    logger.error("Could not find form_build_id")
                    return None

                form_build_id = form_input.get_attribute("value")
                logger.info(f"Extracted form_build_id: {form_build_id}")

                # Build the date filter. "/" is kept literal so well-formed
                # dates are sent exactly as before.
                date_component = ""
                if start_date and end_date:
                    date_component = (
                        f"&date[start]={quote_plus(str(start_date), safe='/')}"
                        f"&date[end]={quote_plus(str(end_date), safe='/')}"
                    )
                elif start_date:
                    try:
                        start_dt = datetime.strptime(start_date, "%d/%m/%y")
                        end_dt = start_dt + timedelta(days=90)
                        date_component = (
                            f"&date[start]={quote_plus(start_date, safe='/')}"
                            f"&date[end]={end_dt.strftime('%d/%m/%y')}"
                        )
                    except ValueError:
                        logger.warning(f"Invalid date format: {start_date}")

                # quote_plus turns spaces into "+" (as the old replace() did)
                # and also escapes "&", "=", "#", accents, etc.
                club_name_encoded = quote_plus(club_name)
                club_code_clean = quote_plus(club_code.replace(" ", ""))

                params = (
                    "recherche_type=club"
                    f"&club[autocomplete][value_container][value_field]={club_code_clean}"
                    f"&club[autocomplete][value_container][label_field]={club_name_encoded}"
                    f"&pratique=PADEL{date_component}"
                    f"&page={page}"
                    "&sort=dateDebut+asc"
                    f"&form_build_id={quote_plus(form_build_id or '')}"
                    "&form_id=recherche_tournois_form"
                    "&_triggering_element_name=submit_page"
                    "&_triggering_element_value=Submit+page"
                )
                logger.info(f"AJAX Parameters: {params}")

                # Make AJAX request and capture the full response
                logger.info("Making AJAX request...")
                result = page_obj.evaluate(ajax_script, params)
            finally:
                # Close the browser on every path, including early returns
                # and exceptions raised while driving the page.
                browser.close()

        logger.info(f"AJAX Response Status: {result.get('status')}")
        logger.info(f"AJAX Response Success: {result.get('success')}")

        if not result.get("success"):
            logger.error(f"AJAX request failed: {result.get('error')}")
            return None

        response_text = result.get("responseText", "")
        logger.info(f"Raw Response Length: {len(response_text)}")

        try:
            json_data = json.loads(response_text)
        except json.JSONDecodeError as json_error:
            logger.error(f"JSON Parse Error: {json_error}")
            logger.error(f"Response text: {response_text}")
            return None

        logger.info(f"JSON Response Type: {type(json_data)}")
        return _parse_ajax_response(json_data)

    except Exception as e:
        logger.error(f"Error in Playwright scraping: {e}")
        logger.error(f"Traceback: {traceback.format_exc()}")
        return None
|
|
|
|
|
|
def scrape_fft_club_tournaments_all_pages(
    club_code, club_name, start_date=None, end_date=None
):
    """
    Walk the result pages of one club and gather every tournament found.

    Pages are requested one at a time through ``scrape_fft_club_tournaments``
    starting at page 0. The walk stops on the first falsy page result, the
    first page without tournaments, once the number collected reaches the
    ``total_results`` reported by the latest page, or on any exception.

    Returns:
        dict with ``tournaments`` (everything collected), ``total_results``
        and ``current_count`` (both the collected count) and
        ``pages_scraped`` (index of the last page requested, plus one).
    """
    logger.info(f"Starting complete tournament scraping for {club_name}")

    collected = []
    page = 0

    while True:
        try:
            # Delegate to the single-page scraper
            page_result = scrape_fft_club_tournaments(
                club_code, club_name, start_date, end_date, page
            )

            # Debug: Log what we got
            logger.info(f"Page {page} result: {page_result}")

            if not page_result:
                logger.warning(f"No result for page {page}")
                break

            batch = page_result.get("tournaments", [])
            logger.info(f"Page {page} returned {len(batch)} tournaments")

            if not batch:
                logger.info(f"No tournaments on page {page}, stopping")
                break

            collected.extend(batch)

            # Compare what we hold against what the site says exists
            expected = page_result.get("total_results", 0)
            logger.info(f"Total so far: {len(collected)}, Target: {expected}")

            if len(collected) >= expected:
                logger.info("Got all tournaments, stopping")
                break

            page += 1
            logger.info(f"Moving to page {page}")

        except Exception as e:
            logger.error(f"Error on page {page}: {e}")
            break

    pages_seen = page + 1
    count = len(collected)
    logger.info(f"Completed scraping: {count} tournaments across {pages_seen} pages")

    return {
        "tournaments": collected,
        "total_results": count,
        "current_count": count,
        "pages_scraped": pages_seen,
    }
|
|
|
|
|
|
def _parse_ajax_response(commands):
    """
    Turn the list of AJAX commands into tournament dictionaries.

    The keys emitted for each tournament are the ones listed in
    ``field_order`` below (per the original author, the layout expected by
    the Swift FederalTournament struct).

    Returns:
        ``None`` if any command is an ``alert`` or if parsing raises;
        otherwise a dict with ``tournaments``, ``total_results`` and
        ``current_count``. The dict is empty/zeroed when no
        ``recherche_tournois_update`` command with results is present.
    """
    # Output keys, in emission order.
    field_order = (
        "id",
        "millesime",
        "libelle",
        "tmc",
        "tarifAdulteChampionnat",
        "type",
        "ageReel",
        "naturesTerrains",
        "idsArbitres",
        "tarifJeuneChampionnat",
        "international",
        "inscriptionEnLigne",
        "categorieTournoi",
        "prixLot",
        "paiementEnLigne",
        "reductionAdherentJeune",
        "reductionAdherentAdulte",
        "paiementEnLigneObligatoire",
        "villeEngagement",
        "senior",
        "veteran",
        "inscriptionEnLigneEnCours",
        "avecResultatPublie",
        "code",
        "categorieAge",
        "codeComite",
        "installations",
        "reductionEpreuveSupplementaireJeune",
        "reductionEpreuveSupplementaireAdulte",
        "nomComite",
        "naturesEpreuves",
        "jeune",
        "courrielEngagement",
        "nomClub",
        "installation",
        "categorieAgeMax",
        "tournoiInterne",
        "nomLigue",
        "nomEngagement",
        "codeLigue",
        "modeleDeBalle",
        "jugeArbitre",
        "adresse2Engagement",
        "epreuves",
        "dateDebut",
        "serie",
        "dateFin",
        "dateValidation",
        "codePostalEngagement",
        "codeClub",
        "prixEspece",
        "japPhoneNumber",
        "adresse1Engagement",
        "originalId",
        "familleTournoi",
        "isTournoi",
        "natureWithCatAge",
    )
    # Keys that fall back to an empty list when absent from the item.
    list_defaulted = ("naturesTerrains", "idsArbitres", "installations", "familleTournoi")

    parsed = []

    try:
        # An alert command means the site is in maintenance mode
        if any(cmd.get("command") == "alert" for cmd in commands):
            logger.warning("Maintenance mode detected")
            return None

        # Locate the first command carrying search results
        update_cmd = next(
            (cmd for cmd in commands if cmd.get("command") == "recherche_tournois_update"),
            None,
        )
        if update_cmd is not None:
            logger.info("Found recherche_tournois_update command!")

        if not (update_cmd and update_cmd.get("results")):
            logger.error("No recherche_tournois_update command found in AJAX response")
            return {"tournaments": [], "total_results": 0, "current_count": 0}

        payload = update_cmd["results"]
        entries = payload.get("items", [])
        reported_total = payload.get("nb_results", 0)

        logger.info(f"Processing {len(entries)} tournaments from results")

        for entry in entries:
            record = {}
            for field in field_order:
                if field == "id":
                    record[field] = str(entry.get("id", ""))
                elif field == "japPhoneNumber":
                    # Will be populated by separate umpire call
                    record[field] = None
                elif field in list_defaulted:
                    record[field] = entry.get(field, [])
                else:
                    # Copied through untouched (dates included)
                    record[field] = entry.get(field)
            parsed.append(record)

        logger.info(f"Successfully parsed {len(parsed)} tournaments from response")
        return {
            "tournaments": parsed,
            "total_results": reported_total,
            "current_count": len(parsed),
        }

    except Exception as e:
        logger.error(f"Error parsing AJAX response: {e}")
        return None
|
|
|
|
|
|
def scrape_fft_all_tournaments(
    sorting_option=None,
    page=0,
    start_date=None,
    end_date=None,
    city="",
    distance=15,
    categories=None,
    levels=None,
    lat=None,
    lng=None,
    ages=None,
    tournament_types=None,
    national_cup=False,
):
    """
    Search tournaments directly, falling back to the club-based search.

    ``scrape_fft_all_tournaments_original`` is tried first; a non-``None``
    result is returned as is. If it returns ``None`` or raises, and ``city``
    is non-empty, the same arguments are forwarded to
    ``scrape_fft_all_tournaments_via_clubs``. Without a city there is nothing
    to fall back on and ``None`` is returned.
    """
    logger.info(f"Starting tournament scraping for city: {city}")

    # Both strategies take exactly the same keyword arguments.
    search_kwargs = {
        "sorting_option": sorting_option,
        "page": page,
        "start_date": start_date,
        "end_date": end_date,
        "city": city,
        "distance": distance,
        "categories": categories,
        "levels": levels,
        "lat": lat,
        "lng": lng,
        "ages": ages,
        "tournament_types": tournament_types,
        "national_cup": national_cup,
    }

    # First try the original direct approach
    try:
        logger.info("Attempting direct location-based search...")
        direct = scrape_fft_all_tournaments_original(**search_kwargs)
        if direct is not None:
            logger.info("Direct search successful")
            return direct
    except Exception as e:
        logger.warning(f"Direct search failed: {e}")

    # The club-based fallback needs a city to look clubs up
    if not city:
        logger.error("Both direct and club-based approaches failed")
        return None

    logger.info("Falling back to club-based search...")
    return scrape_fft_all_tournaments_via_clubs(**search_kwargs)
|
|
|
|
|
|
# Rename the original function
|
|
def scrape_fft_all_tournaments_original(
    sorting_option=None,
    page=0,
    start_date=None,
    end_date=None,
    city="",
    distance=15,
    categories=None,
    levels=None,
    lat=None,
    lng=None,
    ages=None,
    tournament_types=None,
    national_cup=False,
):
    """
    Placeholder for the direct (location-based) tournament search.

    No scraping is implemented here: every argument is ignored and ``None``
    is returned.
    """
    # [Your existing direct scraping code here]
    return None
|
|
|
|
|
|
def get_umpire_data(tournament_id):
    """
    Read the umpire's name, email and phone from a tournament page.

    The page ``https://tenup.fft.fr/tournoi/<tournament_id>`` is loaded with
    Playwright and the three values are pulled out of the HTML with regular
    expressions.

    Returns:
        ``(name, email, phone)``; each element is ``None`` when its pattern
        does not match. ``(None, None, None)`` when the page lands on
        Queue-It or any error occurs.
    """
    logger.info(f"Getting umpire data for tournament {tournament_id}")

    # (label, pattern, strip surrounding whitespace?)
    extractors = (
        (
            "name",
            r'tournoi-detail-page-inscription-responsable-title">\s*([^<]+)\s*<',
            True,
        ),
        ("email", r'mailto:([^"]+)"', False),
        (
            "phone",
            r'<div class="details-bloc">\s*(\d{2}\s+\d{2}\s+\d{2}\s+\d{2}\s+\d{2})\s*</div>',
            True,
        ),
    )

    try:
        with sync_playwright() as p:
            launch_flags = [
                "--no-sandbox",
                "--disable-dev-shm-usage",
                "--disable-images",
                "--disable-javascript",
                "--disable-plugins",
                "--disable-extensions",
            ]
            browser = p.chromium.launch(headless=True, args=launch_flags)
            page = browser.new_page()

            url = f"https://tenup.fft.fr/tournoi/{tournament_id}"
            logger.info(f"Navigating to tournament page: {url}")

            try:
                # Only wait for the DOM, not for the full page load
                page.goto(url, timeout=15000, wait_until="domcontentloaded")

                # Quick Queue-It check
                if "queue-it.net" in page.url.lower():
                    logger.warning("Hit Queue-It on tournament page")
                    browser.close()
                    return None, None, None

                html_content = page.content()

                found = {}
                for label, pattern, strip in extractors:
                    hit = re.search(pattern, html_content)
                    if not hit:
                        found[label] = None
                    elif strip:
                        found[label] = hit.group(1).strip()
                    else:
                        found[label] = hit.group(1)

                name, email, phone = found["name"], found["email"], found["phone"]

                browser.close()

                logger.info(
                    f"Extracted umpire data: name={name}, email={email}, phone={phone}"
                )
                return name, email, phone

            except Exception as page_error:
                logger.error(f"Error loading tournament page: {page_error}")
                browser.close()
                return None, None, None

    except Exception as e:
        logger.error(f"Error in umpire data extraction: {e}")
        return None, None, None
|
|
|
|
|
|
def _get_umpire_data_requests_fallback(tournament_id):
    """
    Fetch the tournament page with ``requests`` and extract umpire details.

    Same regex extraction as the Playwright variant, but over a plain HTTP
    GET (30 second timeout), which, per the original note, may hit Queue-It.

    Returns:
        ``(name, email, phone)`` with ``None`` for anything not found, or
        ``(None, None, None)`` on a non-200 status or any exception.
    """
    logger.info(f"Using requests fallback for tournament {tournament_id}")

    try:
        response = requests.get(
            f"https://tenup.fft.fr/tournoi/{tournament_id}",
            headers={
                "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/18.5 Safari/605.1.15"
            },
            timeout=30,
        )

        if response.status_code != 200:
            logger.error(f"Failed to fetch tournament page: {response.status_code}")
            return None, None, None

        html_content = response.text

        def first_group(pattern):
            # First capture group of the first match, or None
            hit = re.search(pattern, html_content)
            return hit.group(1) if hit else None

        name = first_group(
            r'tournoi-detail-page-inscription-responsable-title">\s*([^<]+)\s*<'
        )
        if name is not None:
            name = name.strip()

        email = first_group(r'mailto:([^"]+)"')

        phone = first_group(
            r'<div class="details-bloc">\s*(\d{2}\s+\d{2}\s+\d{2}\s+\d{2}\s+\d{2})\s*</div>'
        )
        if phone is not None:
            phone = phone.strip()

        logger.info(
            f"Extracted umpire data (requests): name={name}, email={email}, phone={phone}"
        )
        return name, email, phone

    except Exception as e:
        logger.error(f"Error getting umpire data with requests: {e}")
        return None, None, None
|
|
|
|
|
|
def _scrape_single_page(
    sorting_option,
    page,
    start_date,
    end_date,
    city,
    distance,
    categories,
    levels,
    lat,
    lng,
    ages,
    tournament_types,
    national_cup,
):
    """
    Positional-argument adapter around ``scrape_fft_all_tournaments``.

    Lets an executor submit a page fetch with plain positional arguments;
    each one is forwarded under the keyword of the same name.
    """
    forwarded = dict(
        sorting_option=sorting_option,
        page=page,
        start_date=start_date,
        end_date=end_date,
        city=city,
        distance=distance,
        categories=categories,
        levels=levels,
        lat=lat,
        lng=lng,
        ages=ages,
        tournament_types=tournament_types,
        national_cup=national_cup,
    )
    return scrape_fft_all_tournaments(**forwarded)
|
|
|
|
|
|
def scrape_fft_all_tournaments_concurrent(
    sorting_option=None,
    start_date=None,
    end_date=None,
    city="",
    distance=15,
    categories=None,
    levels=None,
    lat=None,
    lng=None,
    ages=None,
    tournament_types=None,
    national_cup=False,
    max_workers=5,
):
    """
    Fetch result pages 1..N-1 of a tournament search in a thread pool.

    Page 0 is requested here only to learn the total and the page size; its
    tournaments are NOT part of the returned list (the original docstring
    states the client already has page 0).

    Returns:
        ``None`` when page 0 cannot be fetched; otherwise a dict with
        ``tournaments`` (pages 1 onward), ``total_results`` (as reported by
        page 0), ``current_count`` and ``pages_scraped`` (number of pages
        submitted, whether or not each one succeeded).
    """
    logger.info("Starting concurrent scraping for remaining tournament pages")

    # Page 0 supplies the pagination figures
    first_page_result = scrape_fft_all_tournaments(
        sorting_option=sorting_option,
        page=0,
        start_date=start_date,
        end_date=end_date,
        city=city,
        distance=distance,
        categories=categories,
        levels=levels,
        lat=lat,
        lng=lng,
        ages=ages,
        tournament_types=tournament_types,
        national_cup=national_cup,
    )

    if not first_page_result:
        logger.error("Failed to get first page results for pagination info")
        return None

    total_results = first_page_result.get("total_results", 0)
    results_per_page = len(first_page_result.get("tournaments", []))

    logger.info(f"Total results: {total_results}, Results per page: {results_per_page}")

    def summary(tournaments, total, pages):
        # Common shape of every non-None return value
        return {
            "tournaments": tournaments,
            "total_results": total,
            "current_count": len(tournaments),
            "pages_scraped": pages,
        }

    if total_results == 0:
        return summary([], 0, 0)

    # Ceiling division; an empty first page counts as a single page
    total_pages = (
        (total_results + results_per_page - 1) // results_per_page
        if results_per_page > 0
        else 1
    )

    logger.info(f"Total pages: {total_pages}")

    # A single page means page 0 was everything: nothing left to fetch
    if total_pages <= 1:
        return summary([], total_results, 0)

    gathered = []

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        # One task per remaining page (1 .. total_pages-1)
        pending = [
            (
                page,
                executor.submit(
                    _scrape_single_page,
                    sorting_option,
                    page,
                    start_date,
                    end_date,
                    city,
                    distance,
                    categories,
                    levels,
                    lat,
                    lng,
                    ages,
                    tournament_types,
                    national_cup,
                ),
            )
            for page in range(1, total_pages)
        ]

        # Results are read in page order, waiting up to 60 seconds for each
        for page, future in pending:
            try:
                outcome = future.result(timeout=60)
                if outcome and outcome.get("tournaments"):
                    batch = outcome.get("tournaments", [])
                    gathered.extend(batch)
                    logger.info(f"Page {page} completed: {len(batch)} tournaments")
                else:
                    logger.warning(f"Page {page} returned no results")
            except Exception as e:
                logger.error(f"Error processing page {page}: {e}")

    remaining_pages = total_pages - 1  # Excluding page 0 which was handled separately
    logger.info(
        f"Concurrent scraping completed: {len(gathered)} tournaments from {remaining_pages} remaining pages"
    )

    return summary(gathered, total_results, remaining_pages)
|
|
|
|
|
|
def _parse_clubs_ajax_response(json_data):
    """
    Normalise a clubs response into ``typeRecherche``/``nombreResultat``/
    ``club_markers`` (per the original author, the Swift FederalClubResponse
    layout).

    Accepted shapes:
      * dict with ``club_markers`` (+ optional ``nombreResultat``)
      * dict with ``results`` holding ``clubs``/``items``/``club_markers``
        and ``nombreResultat``/``total``/``nb_results``, or holding a list
      * dict with ``data``
      * a bare list of clubs

    Non-dict entries are skipped. A malformed value inside one club no longer
    discards the whole response: null or non-numeric coordinates become
    ``0.0`` and a null or scalar ``pratiques`` is normalised to a list.

    Returns:
        dict with ``typeRecherche`` (always ``"clubs"``), ``nombreResultat``
        and ``club_markers``; zeroed/empty if parsing raises.
    """

    def _to_float(value):
        # Coordinates may be null or free text; fall back to 0.0.
        try:
            return float(value)
        except (TypeError, ValueError):
            return 0.0

    def _pratiques_of(item):
        if "pratiques" in item:
            raw = item["pratiques"]
        elif "practices" in item:
            raw = item["practices"]
        else:
            raw = None
        if raw is None:
            # Default to PADEL if not specified
            return ["PADEL"]
        if not isinstance(raw, (list, tuple, set)):
            # A bare string must not be iterated character by character.
            raw = [raw]
        return [p.upper() if isinstance(p, str) else str(p).upper() for p in raw]

    try:
        # Log the raw response structure to understand what we're getting
        logger.info(f"Raw clubs response structure: {json_data}")

        clubs_data = []
        total_results = None  # None => derive from the number of clubs

        if isinstance(json_data, dict):
            if "club_markers" in json_data:
                # Pattern 1: direct club_markers array
                clubs_data = json_data["club_markers"]
                total_results = json_data.get("nombreResultat")
            elif "results" in json_data:
                # Pattern 2: results wrapper
                results = json_data["results"]
                if isinstance(results, dict):
                    clubs_data = results.get(
                        "clubs", results.get("items", results.get("club_markers", []))
                    )
                    total_results = results.get(
                        "nombreResultat",
                        results.get("total", results.get("nb_results")),
                    )
                else:
                    clubs_data = results
            elif "data" in json_data:
                # Pattern 3: array under "data"
                clubs_data = json_data["data"]
            # Any other dict carries no recognisable clubs.
        elif isinstance(json_data, list):
            # Pattern 4: response is the clubs array itself
            clubs_data = json_data
        else:
            logger.error(f"Unexpected response format: {type(json_data)}")
            total_results = 0

        if not isinstance(clubs_data, list):
            clubs_data = []
        if total_results is None:
            total_results = len(clubs_data)

        club_markers = []
        for item in clubs_data:
            if not isinstance(item, dict):
                continue
            club_markers.append(
                {
                    "nom": item.get("nom", item.get("name", "")),
                    "clubId": str(
                        item.get("clubId", item.get("id", item.get("code", "")))
                    ),
                    "ville": item.get("ville", item.get("city", "")),
                    "distance": str(item.get("distance", "0")),
                    "terrainPratiqueLibelle": item.get(
                        "terrainPratiqueLibelle", item.get("courtsInfo", "")
                    ),
                    "pratiques": _pratiques_of(item),
                    "lat": _to_float(item.get("lat", item.get("latitude", 0.0))),
                    "lng": _to_float(item.get("lng", item.get("longitude", 0.0))),
                }
            )

        logger.info(
            f"Successfully parsed {len(club_markers)} club markers from response"
        )

        return {
            "typeRecherche": "clubs",
            "nombreResultat": total_results,
            "club_markers": club_markers,
        }

    except Exception as e:
        logger.error(f"Error parsing clubs AJAX response: {e}")
        return {"typeRecherche": "clubs", "nombreResultat": 0, "club_markers": []}
|
|
|
|
|
|
def scrape_federal_clubs(
    country=None, city="", latitude=None, longitude=None, radius=15, max_workers=5
):
    """
    Scrape padel clubs around a city from the Ten'Up club search page.

    The results page is opened with Playwright and the club list is read
    from ``Drupal.settings.fft_recherche_club`` inside the page.

    Args:
        country: Country code for the query; ``"fr"`` when falsy.
        city: City text. A 5-digit postcode and anything after it are
            stripped before the search.
        latitude: Accepted for interface compatibility; not used here.
        longitude: Accepted for interface compatibility; not used here.
        radius: Search distance, converted with ``int()``.
        max_workers: Accepted for interface compatibility; not used here.

    Returns:
        dict with ``typeRecherche``, ``nombreResultat`` and ``club_markers``
        (each marker has ``nom``, ``clubId``, ``ville``, ``distance``,
        ``terrainPratiqueLibelle``, ``pratiques``, ``lat``, ``lng``). An
        empty result (``nombreResultat`` 0) is returned on Queue-It, when
        the page data is missing, or on any exception.
    """
    # Function-scope import so this block does not depend on the file-level
    # import section being edited.
    from urllib.parse import quote

    logger.info(f"Starting federal clubs scraping for city: {city}, country: {country}")

    # Read the club search data straight out of the page's Drupal settings
    extraction_script = """
    () => {
        try {
            // Check if Drupal.settings exists and has the data
            if (typeof Drupal !== 'undefined' &&
                Drupal.settings &&
                Drupal.settings.fft_recherche_club) {

                const data = Drupal.settings.fft_recherche_club;

                return {
                    success: true,
                    typeRecherche: data.typeRecherche || 'club',
                    total: data.total || 0,
                    resultat: data.resultat || []
                };
            }

            return {
                success: false,
                error: 'Drupal.settings.fft_recherche_club not found'
            };
        } catch (error) {
            return {
                success: false,
                error: error.message
            };
        }
    }
    """

    def _empty_result():
        return {"typeRecherche": "clubs", "nombreResultat": 0, "club_markers": []}

    try:
        with sync_playwright() as p:
            browser = p.chromium.launch(headless=True)
            try:
                page_obj = browser.new_page()

                page_obj.set_extra_http_headers(
                    {
                        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/18.5 Safari/605.1.15"
                    }
                )

                # Clean up city name - remove zip code and extra info
                clean_city = city
                if city:
                    clean_city = re.sub(r"[,\s]*\d{5}.*$", "", city).strip()
                    clean_city = clean_city.rstrip(",").strip()

                logger.info(f"Cleaned city name: '{city}' -> '{clean_city}'")

                # Percent-encode the free-text values so "&", "#", "=" or
                # spaces in a city name cannot corrupt the query string.
                params = (
                    f"ville={quote(clean_city or '')}"
                    f"&pratique=PADEL&distance={int(radius)}"
                    f"&country={quote(country or 'fr')}"
                )
                results_url = f"https://tenup.fft.fr/recherche/clubs/resultats?{params}"

                logger.info(f"Requesting results URL: {results_url}")

                # Navigate to the results page
                page_obj.goto(results_url)

                # Fixed pause after navigation (kept from the original)
                page_obj.wait_for_timeout(3000)

                # Check if we're in queue
                if "queue-it.net" in page_obj.url.lower():
                    logger.warning("Hit Queue-It on results page")
                    return _empty_result()

                result = page_obj.evaluate(extraction_script)
            finally:
                # Close the browser on every path, including early returns
                # and exceptions raised while driving the page.
                browser.close()

        if not result.get("success"):
            logger.error(f"Failed to extract data: {result.get('error')}")
            return _empty_result()

        type_recherche = result.get("typeRecherche", "club")
        total = result.get("total", 0)
        resultat = result.get("resultat", [])

        logger.info(f"Successfully extracted {total} clubs")

        # Convert resultat to club_markers format
        club_markers = [
            {
                "nom": club.get("nom", ""),
                "clubId": club.get("clubId", ""),
                "ville": club.get("ville", ""),
                "distance": club.get("distance", ""),
                "terrainPratiqueLibelle": club.get("terrainPratiqueLibelle", ""),
                "pratiques": club.get("pratiques", []),
                "lat": club.get("lat", 0.0),
                "lng": club.get("lng", 0.0),
            }
            for club in resultat
        ]

        return {
            "typeRecherche": type_recherche,
            "nombreResultat": total,
            "club_markers": club_markers,
        }

    except Exception as e:
        logger.error(f"Error in federal clubs scraping: {e}")
        logger.error(f"Traceback: {traceback.format_exc()}")
        return _empty_result()
|
|
|
|
|
|
def _via_clubs_filter_text(value):
    """Return ``value`` as lowercase text for substring filters ('' for None)."""
    return "" if value is None else str(value).lower()


def _via_clubs_passes_filters(
    tournament, categories, levels, ages, tournament_types, national_cup
):
    """Return True when ``tournament`` satisfies every filter that was supplied."""
    # NOTE(review): "niveau" and "tournoi_npc" are not among the keys that
    # _parse_ajax_response emits, so the levels / national_cup filters
    # currently reject every tournament -- confirm the intended field names.
    substring_filters = (
        (categories, "categorieTournoi"),
        (levels, "niveau"),
        (ages, "categorieAge"),
        (tournament_types, "type"),
    )
    for wanted, key in substring_filters:
        if not wanted:
            continue
        haystack = _via_clubs_filter_text(tournament.get(key))
        if not any(_via_clubs_filter_text(term) in haystack for term in wanted):
            return False

    if national_cup and not tournament.get("tournoi_npc", False):
        return False
    return True


def _via_clubs_start_sort_key(tournament):
    """Return a string sort key for ``dateDebut`` (dict with ``date``, string, or missing)."""
    start = tournament.get("dateDebut")
    if isinstance(start, dict):
        return str(start.get("date") or "")
    return "" if start is None else str(start)


def scrape_fft_all_tournaments_via_clubs(
    sorting_option=None,
    page=0,
    start_date=None,
    end_date=None,
    city="",
    distance=15,
    categories=None,
    levels=None,
    lat=None,
    lng=None,
    ages=None,
    tournament_types=None,
    national_cup=False,
):
    """
    Get tournaments by location using the club-based approach.

    1. Get clubs in the area using ``scrape_federal_clubs``
    2. Get the first page of tournaments of each club (at most 50 clubs)
       using ``scrape_fft_club_tournaments``, pausing 0.5 s between clubs
    3. Filter, de-duplicate by ``id``, sort by start date and slice out the
       requested page (20 tournaments per page)

    Club markers are read as ``codeClub``/``nomClub`` with a fallback to
    ``clubId``/``nom``, the keys ``scrape_federal_clubs`` produces.

    Returns:
        dict with ``tournaments``, ``total_results``, ``current_count`` and
        ``method``; an empty three-key dict when no club is found; ``None``
        if an unexpected exception occurs.
    """
    import time

    logger.info(
        f"Starting hybrid club-based tournament scraping for city: {city}, distance: {distance}km"
    )

    try:
        # Step 1: Get clubs in the area
        logger.info("Step 1: Getting clubs in the area...")
        clubs_result = scrape_federal_clubs(
            country="fr", city=city, latitude=lat, longitude=lng, radius=distance
        )

        if not clubs_result or clubs_result.get("nombreResultat", 0) == 0:
            logger.warning(f"No clubs found for city: {city}")
            return {"tournaments": [], "total_results": 0, "current_count": 0}

        clubs = clubs_result.get("club_markers", [])
        logger.info(f"Found {len(clubs)} clubs in {city} area")

        # Step 2: Get tournaments for each club
        all_tournaments = []
        processed_clubs = 0
        max_clubs = 50  # Limit to prevent too many requests

        for club in clubs[:max_clubs]:
            # The clubs scraper emits "clubId"/"nom"; looking only at
            # "codeClub"/"nomClub" skipped every club.
            club_code = club.get("codeClub") or club.get("clubId")
            club_name = club.get("nomClub") or club.get("nom")

            if not club_code or not club_name:
                continue

            logger.info(f"Getting tournaments for club: {club_name} ({club_code})")

            try:
                club_result = scrape_fft_club_tournaments(
                    club_code=str(club_code),
                    club_name=str(club_name),
                    start_date=start_date,
                    end_date=end_date,
                    page=0,  # Always get first page for each club
                )

                if club_result and club_result.get("tournaments"):
                    # Apply filters the club search itself does not handle
                    filtered_tournaments = [
                        tournament
                        for tournament in club_result["tournaments"]
                        if _via_clubs_passes_filters(
                            tournament,
                            categories,
                            levels,
                            ages,
                            tournament_types,
                            national_cup,
                        )
                    ]

                    all_tournaments.extend(filtered_tournaments)
                    logger.info(
                        f"Added {len(filtered_tournaments)} tournaments from {club_name}"
                    )

                processed_clubs += 1

                # Add small delay to be respectful
                time.sleep(0.5)

            except Exception as club_error:
                logger.warning(
                    f"Error getting tournaments for club {club_name}: {club_error}"
                )
                continue

        logger.info(
            f"Processed {processed_clubs} clubs, found {len(all_tournaments)} total tournaments"
        )

        # Step 3: Remove duplicates (tournaments might appear in multiple club searches)
        unique_tournaments = []
        seen_tournament_ids = set()

        for tournament in all_tournaments:
            tournament_id = tournament.get("id")
            if tournament_id and tournament_id not in seen_tournament_ids:
                unique_tournaments.append(tournament)
                seen_tournament_ids.add(tournament_id)

        logger.info(
            f"After deduplication: {len(unique_tournaments)} unique tournaments"
        )

        # Step 4: Sort tournaments
        if sorting_option == "dateDebut+asc" or not sorting_option:
            unique_tournaments.sort(key=_via_clubs_start_sort_key)
        elif sorting_option == "dateDebut+desc":
            unique_tournaments.sort(key=_via_clubs_start_sort_key, reverse=True)
        # Note: Distance sorting would require additional calculation

        # Step 5: Handle pagination
        tournaments_per_page = 20
        start_idx = page * tournaments_per_page
        end_idx = start_idx + tournaments_per_page

        page_tournaments = unique_tournaments[start_idx:end_idx]

        return {
            "tournaments": page_tournaments,
            "total_results": len(unique_tournaments),
            "current_count": len(page_tournaments),
            "method": "club_based_hybrid",
        }

    except Exception as e:
        logger.error(f"Error in hybrid club-based scraping: {e}")
        logger.error(f"Traceback: {traceback.format_exc()}")
        return None
|
|
|