Refactor FFT tournament scraping with Queue-It fallback

mailing
Razmig Sarkissian 2 months ago
parent 7d997fdb7d
commit 34d8fac0d5
  1 changed file with 417 line changes:
      api/utils.py

@ -364,189 +364,77 @@ def scrape_fft_all_tournaments(
national_cup=False,
):
"""
Scrapes FFT tournaments using Playwright with detailed debugging
Based exactly on the working scrape_fft_club_tournaments function
Scrapes FFT tournaments with Queue-It fallback to club-based approach
"""
logger.info(f"Starting Playwright scraping for city: {city}")
logger.info(f"Starting tournament scraping for city: {city}")
# First try the original direct approach
try:
with sync_playwright() as p:
browser = p.chromium.launch(headless=True)
page_obj = browser.new_page()
page_obj.set_extra_http_headers(
{
"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/18.5 Safari/605.1.15"
}
)
# Navigate to FFT
target_url = "https://tenup.fft.fr/recherche/tournois"
logger.info(f"Navigating to: {target_url}")
page_obj.goto(target_url)
# page_obj.wait_for_timeout(7000)
current_url = page_obj.url
logger.info(f"Current URL: {current_url}")
if "queue-it.net" in current_url.lower():
logger.warning("Still in Queue-It")
browser.close()
return None
# Extract form_build_id
form_input = page_obj.query_selector('input[name="form_build_id"]')
if not form_input:
logger.error("Could not find form_build_id")
browser.close()
return None
form_build_id = form_input.get_attribute("value")
logger.info(f"Extracted form_build_id: {form_build_id}")
# Build parameters - EXACT same pattern as club function
date_component = ""
if start_date and end_date:
date_component = f"&date[start]={start_date}&date[end]={end_date}"
elif start_date:
try:
start_dt = datetime.strptime(start_date, "%d/%m/%y")
end_dt = start_dt + timedelta(days=90)
date_component = f"&date[start]={start_date}&date[end]={end_dt.strftime('%d/%m/%y')}"
except ValueError:
logger.warning(f"Invalid date format: {start_date}")
# Build filter parameters
filter_params = ""
# Add categories filter
if categories:
logger.info(f"Adding categories filter: {categories}")
for category in categories:
filter_params += f"&epreuve[{category}]={category}"
# Add levels filter
if levels:
logger.info(f"Adding levels filter: {levels}")
for level in levels:
filter_params += f"&categorie_tournoi[{level}]={level}"
# Add ages filter
if ages:
logger.info(f"Adding ages filter: {ages}")
for age in ages:
filter_params += f"&categorie_age[{age}]={age}"
# Add types filter
if tournament_types:
logger.info(f"Adding types filter: {tournament_types}")
for t_type in tournament_types:
capitalized_type = t_type.capitalize()
filter_params += f"&type[{capitalized_type}]={capitalized_type}"
# Add national cup filter
if national_cup:
logger.info("Adding national cup filter")
filter_params += "&tournoi_npc=1"
# Fix the sorting parameter
if sorting_option:
sort_param = f"&sort={sorting_option}"
else:
sort_param = "&sort=dateDebut+asc"
# Build city parameters with distance and location
if city and city.strip():
city_name_encoded = city.strip().replace(" ", "+")
# Start with the working base parameters
base_params = f"recherche_type=ville&ville[autocomplete][value_container][value_field]={city_name_encoded}&ville[autocomplete][value_container][label_field]={city_name_encoded}"
# Add distance parameter
distance_param = f"&ville[distance][value_field]={int(distance)}"
# Add lat/lng if provided
location_params = ""
if lat and lng:
location_params = f"&ville[autocomplete][value_container][lat_field]={lat}&ville[autocomplete][value_container][lng_field]={lng}"
# Combine all parameters including filters
params = f"{base_params}{location_params}{distance_param}&pratique=PADEL{date_component}&page={page}{sort_param}&form_build_id={form_build_id}&form_id=recherche_tournois_form&_triggering_element_name=submit_page&_triggering_element_value=Submit+page"
else:
# Default to ligue search if no city provided
params = f"recherche_type=ligue&pratique=PADEL{date_component}{filter_params}&page={page}&sort={sorting_option or 'dateDebut+asc'}&form_build_id={form_build_id}&form_id=recherche_tournois_form&_triggering_element_name=submit_page&_triggering_element_value=Submit+page"
logger.info(f"AJAX Parameters: {params}")
# Make AJAX request and capture the full response - EXACT same as club function
ajax_script = f"""
async () => {{
try {{
const response = await fetch('https://tenup.fft.fr/system/ajax', {{
method: 'POST',
headers: {{
'Accept': 'application/json, text/javascript, */*; q=0.01',
'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
'X-Requested-With': 'XMLHttpRequest',
'Origin': 'https://tenup.fft.fr',
'Referer': 'https://tenup.fft.fr/recherche/tournois'
}},
body: `{params}`
}});
const status = response.status;
const responseText = await response.text();
return {{
success: response.ok,
status: status,
responseText: responseText
}};
}} catch (error) {{
return {{
success: false,
error: error.message
}};
}}
}}
"""
logger.info("Making AJAX request...")
result = page_obj.evaluate(ajax_script)
browser.close()
# Print the full response for debugging - EXACT same as club function
logger.info(f"AJAX Response Status: {result.get('status')}")
logger.info(f"AJAX Response Success: {result.get('success')}")
if result.get("success"):
response_text = result.get("responseText", "")
logger.info(f"Raw Response Length: {len(response_text)}")
logger.info("Attempting direct location-based search...")
result = scrape_fft_all_tournaments_original(
sorting_option=sorting_option,
page=page,
start_date=start_date,
end_date=end_date,
city=city,
distance=distance,
categories=categories,
levels=levels,
lat=lat,
lng=lng,
ages=ages,
tournament_types=tournament_types,
national_cup=national_cup,
)
try:
# Try to parse as JSON
json_data = json.loads(response_text)
logger.info(f"JSON Response Type: {type(json_data)}")
if result is not None:
logger.info("Direct search successful")
return result
# Now try to parse it - EXACT same as club function
parsed_result = _parse_ajax_response(json_data)
except Exception as e:
logger.warning(f"Direct search failed: {e}")
# Fallback to club-based approach
if city: # Only use fallback if we have a city to search for clubs
logger.info("Falling back to club-based search...")
return scrape_fft_all_tournaments_via_clubs(
sorting_option=sorting_option,
page=page,
start_date=start_date,
end_date=end_date,
city=city,
distance=distance,
categories=categories,
levels=levels,
lat=lat,
lng=lng,
ages=ages,
tournament_types=tournament_types,
national_cup=national_cup,
)
return parsed_result
logger.error("Both direct and club-based approaches failed")
return None
except json.JSONDecodeError as json_error:
logger.error(f"JSON Parse Error: {json_error}")
logger.error(f"Response text: {response_text}")
return None
else:
logger.error(f"AJAX request failed: {result.get('error')}")
return None
except Exception as e:
logger.error(f"Error in Playwright scraping: {e}")
logger.error(f"Traceback: {traceback.format_exc()}")
return None
# Rename the original function
def scrape_fft_all_tournaments_original(
    sorting_option=None,
    page=0,
    start_date=None,
    end_date=None,
    city="",
    distance=15,
    categories=None,
    levels=None,
    lat=None,
    lng=None,
    ages=None,
    tournament_types=None,
    national_cup=False,
):
    """
    Direct (Playwright-based) FFT tournament search.

    TODO: this is an unimplemented placeholder — the original direct
    scraping implementation was meant to be moved here but never was.
    Until it is, this always returns None, which makes the caller
    (scrape_fft_all_tournaments) fall straight through to the
    club-based fallback.

    Returns:
        None (placeholder).
    """
    # Explicit return instead of a bare `pass` so the always-None
    # behavior is visible to readers and callers alike.
    return None
def get_umpire_data(tournament_id):
@ -1056,3 +944,182 @@ def scrape_federal_clubs(
logger.error(f"Error in federal clubs scraping: {e}")
logger.error(f"Traceback: {traceback.format_exc()}")
return {"typeRecherche": "clubs", "nombreResultat": 0, "club_markers": []}
def _tournament_matches_filters(
    tournament, categories, levels, ages, tournament_types, national_cup
):
    """Return True when *tournament* passes every requested client-side filter.

    Each filter is optional (falsy means "don't filter"); string filters
    match case-insensitively as substrings of the tournament's field.
    """
    if categories:
        value = tournament.get("categorieTournoi", "")
        if not any(cat.lower() in value.lower() for cat in categories):
            return False
    if levels:
        value = tournament.get("niveau", "")
        if not any(level.lower() in value.lower() for level in levels):
            return False
    if ages:
        value = tournament.get("categorieAge", "")
        if not any(age.lower() in value.lower() for age in ages):
            return False
    if tournament_types:
        value = tournament.get("type", "")
        if not any(t_type.lower() in value.lower() for t_type in tournament_types):
            return False
    if national_cup and not tournament.get("tournoi_npc", False):
        return False
    return True


def scrape_fft_all_tournaments_via_clubs(
    sorting_option=None,
    page=0,
    start_date=None,
    end_date=None,
    city="",
    distance=15,
    categories=None,
    levels=None,
    lat=None,
    lng=None,
    ages=None,
    tournament_types=None,
    national_cup=False,
):
    """
    Get tournaments by location using the working club-based approach.

    1. Find clubs in the area with scrape_federal_clubs.
    2. Fetch each club's tournaments with scrape_fft_club_tournaments.
    3. Filter, deduplicate, sort, and paginate the aggregate.

    Args:
        sorting_option: "dateDebut+asc" (default) or "dateDebut+desc";
            distance sorting is not supported here.
        page: zero-based results page (20 tournaments per page).
        start_date / end_date: forwarded to the per-club search.
        city, distance, lat, lng: location constraints for the club lookup.
        categories, levels, ages, tournament_types, national_cup:
            optional client-side tournament filters.

    Returns:
        A dict with "tournaments", "total_results", "current_count" and
        "method" keys; an empty-result dict when no clubs are found; or
        None on unexpected failure (matching the direct search's contract).
    """
    # Hoisted out of the per-club loop, where it was re-executed each
    # iteration in the previous version.
    import time

    logger.info(
        f"Starting hybrid club-based tournament scraping for city: {city}, distance: {distance}km"
    )
    try:
        # Step 1: Get clubs in the area
        logger.info("Step 1: Getting clubs in the area...")
        clubs_result = scrape_federal_clubs(
            country="fr", city=city, latitude=lat, longitude=lng, radius=distance
        )
        if not clubs_result or clubs_result.get("nombreResultat", 0) == 0:
            logger.warning(f"No clubs found for city: {city}")
            return {"tournaments": [], "total_results": 0, "current_count": 0}

        clubs = clubs_result.get("club_markers", [])
        logger.info(f"Found {len(clubs)} clubs in {city} area")

        # Step 2: Get tournaments for each club
        all_tournaments = []
        processed_clubs = 0
        max_clubs = 50  # Limit to prevent too many requests
        for club in clubs[:max_clubs]:
            club_code = club.get("codeClub")
            club_name = club.get("nomClub")
            if not club_code or not club_name:
                continue
            logger.info(f"Getting tournaments for club: {club_name} ({club_code})")
            try:
                club_result = scrape_fft_club_tournaments(
                    club_code=club_code,
                    club_name=club_name,
                    start_date=start_date,
                    end_date=end_date,
                    page=0,  # Always get first page for each club
                )
                if club_result and club_result.get("tournaments"):
                    # Apply filters the club search does not handle itself.
                    filtered_tournaments = [
                        t
                        for t in club_result["tournaments"]
                        if _tournament_matches_filters(
                            t, categories, levels, ages, tournament_types, national_cup
                        )
                    ]
                    all_tournaments.extend(filtered_tournaments)
                    logger.info(
                        f"Added {len(filtered_tournaments)} tournaments from {club_name}"
                    )
                processed_clubs += 1
                # Add small delay to be respectful
                time.sleep(0.5)
            except Exception as club_error:
                # One failing club must not abort the whole aggregation.
                logger.warning(
                    f"Error getting tournaments for club {club_name}: {club_error}"
                )
                continue

        logger.info(
            f"Processed {processed_clubs} clubs, found {len(all_tournaments)} total tournaments"
        )

        # Step 3: Remove duplicates (tournaments might appear in multiple club searches)
        unique_tournaments = []
        seen_tournament_ids = set()
        for tournament in all_tournaments:
            tournament_id = tournament.get("id")
            if tournament_id and tournament_id not in seen_tournament_ids:
                unique_tournaments.append(tournament)
                seen_tournament_ids.add(tournament_id)
        logger.info(
            f"After deduplication: {len(unique_tournaments)} unique tournaments"
        )

        # Step 4: Sort tournaments by start date (ascending is the default)
        if sorting_option == "dateDebut+asc" or not sorting_option:
            unique_tournaments.sort(
                key=lambda x: x.get("dateDebut", {}).get("date", "")
            )
        elif sorting_option == "dateDebut+desc":
            unique_tournaments.sort(
                key=lambda x: x.get("dateDebut", {}).get("date", ""), reverse=True
            )
        # Note: Distance sorting would require additional calculation

        # Step 5: Handle pagination
        tournaments_per_page = 20
        start_idx = page * tournaments_per_page
        end_idx = start_idx + tournaments_per_page
        page_tournaments = unique_tournaments[start_idx:end_idx]

        return {
            "tournaments": page_tournaments,
            "total_results": len(unique_tournaments),
            "current_count": len(page_tournaments),
            "method": "club_based_hybrid",
        }
    except Exception as e:
        logger.error(f"Error in hybrid club-based scraping: {e}")
        logger.error(f"Traceback: {traceback.format_exc()}")
        return None

Loading…
Cancel
Save