You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 
 
 
 
padelclub_backend/api/utils.py

1290 lines
48 KiB

import json
import logging
import re
import traceback
from concurrent.futures import ThreadPoolExecutor, as_completed
from datetime import datetime, timedelta
from urllib.parse import quote, quote_plus, urlencode

import requests
from django.conf import settings
from playwright.sync_api import sync_playwright
logger = logging.getLogger(__name__)
def check_version_smaller_than_1_1_12(version_str):
    """Return True when ``version_str`` denotes a version older than 1.1.12.

    The version may carry a trailing build marker, e.g. ``"1.1.12 (2)"``; only
    the first whitespace-separated token is compared.

    Args:
        version_str: Dotted version string such as ``"1.1.9"``.

    Returns:
        True if the numeric components compare lower than ``[1, 1, 12]``.
        False for equal/newer versions and for empty, blank, ``None`` or
        non-numeric input.
    """
    # split() on an empty or blank string yields [], so guard before indexing.
    tokens = version_str.split() if version_str else []
    if not tokens:
        return False
    try:
        version_parts = [int(part) for part in tokens[0].split(".")]
    except ValueError:
        # Not a purely numeric dotted version; do not claim it is older.
        return False
    # List comparison is lexicographic, so e.g. [1, 1] < [1, 1, 12] is True.
    return version_parts < [1, 1, 12]
def scrape_fft_club_tournaments(
    club_code, club_name, start_date=None, end_date=None, page=0
):
    """Fetch one page of padel tournaments for a club from tenup.fft.fr.

    Loads the tournament search page in a headless browser to read the
    ``form_build_id`` token, then replays the search form as an AJAX POST from
    inside that page.

    Args:
        club_code: Club code; spaces are removed before it is sent.
        club_name: Club label sent alongside the code.
        start_date: Optional start date string. When given without
            ``end_date`` it must parse as ``dd/mm/yy``.
        end_date: Optional end date string. When only ``start_date`` is
            given, the end defaults to 90 days after it.
        page: Result page index passed to the site (callers start at 0).

    Returns:
        The value returned by ``_parse_ajax_response``, or ``None`` when the
        site redirects to Queue-It, the form token is missing, the request
        fails, the response is not JSON, or any exception is raised.
    """
    try:
        with sync_playwright() as p:
            browser = get_browser_for_environment(p)
            # try/finally closes the browser on every exit path, including
            # navigation errors raised before any of the explicit returns.
            try:
                page_obj = browser.new_page()
                page_obj.set_extra_http_headers(
                    {
                        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/18.5 Safari/605.1.15"
                    }
                )
                page_obj.goto("https://tenup.fft.fr/recherche/tournois")
                page_obj.wait_for_timeout(2000)
                if "queue-it.net" in page_obj.url.lower():
                    logger.warning("Still in Queue-It")
                    return None

                # The search form only accepts submissions carrying its token.
                form_input = page_obj.query_selector('input[name="form_build_id"]')
                if not form_input:
                    logger.error("Could not find form_build_id")
                    return None
                form_build_id = form_input.get_attribute("value")

                date_component = ""
                if start_date and end_date:
                    date_component = f"&date[start]={start_date}&date[end]={end_date}"
                elif start_date:
                    try:
                        start_dt = datetime.strptime(start_date, "%d/%m/%y")
                        end_dt = start_dt + timedelta(days=90)
                        date_component = f"&date[start]={start_date}&date[end]={end_dt.strftime('%d/%m/%y')}"
                    except ValueError:
                        logger.warning(f"Invalid date format: {start_date}")

                # Percent-encode the free-text values so characters such as
                # '&', '=' or '+' in a club name cannot corrupt the form body.
                club_name_encoded = quote_plus(club_name)
                club_code_clean = quote_plus(club_code.replace(" ", ""))
                params = (
                    "recherche_type=club"
                    f"&club[autocomplete][value_container][value_field]={club_code_clean}"
                    f"&club[autocomplete][value_container][label_field]={club_name_encoded}"
                    f"&pratique=PADEL{date_component}&page={page}&sort=dateDebut+asc"
                    f"&form_build_id={form_build_id}&form_id=recherche_tournois_form"
                    "&_triggering_element_name=submit_page"
                    "&_triggering_element_value=Submit+page"
                )

                # The body is handed over as an evaluate() argument instead of
                # being interpolated into the script, so a backtick or "${" in
                # the data cannot alter the JavaScript.
                ajax_script = """
                async (body) => {
                    try {
                        const response = await fetch('https://tenup.fft.fr/system/ajax', {
                            method: 'POST',
                            headers: {
                                'Accept': 'application/json, text/javascript, */*; q=0.01',
                                'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
                                'X-Requested-With': 'XMLHttpRequest',
                                'Origin': 'https://tenup.fft.fr',
                                'Referer': 'https://tenup.fft.fr/recherche/tournois'
                            },
                            body: body
                        });
                        const responseText = await response.text();
                        return {
                            success: response.ok,
                            status: response.status,
                            responseText: responseText
                        };
                    } catch (error) {
                        return {
                            success: false,
                            error: error.message
                        };
                    }
                }
                """
                result = page_obj.evaluate(ajax_script, params)
            finally:
                browser.close()

        if not result.get("success"):
            # 'error' is only set when fetch() itself threw; for HTTP-level
            # failures the status code is the useful piece of information.
            logger.error(
                f"AJAX request failed (status={result.get('status')}): {result.get('error')}"
            )
            return None

        response_text = result.get("responseText", "")
        try:
            json_data = json.loads(response_text)
        except json.JSONDecodeError as json_error:
            logger.error(f"JSON Parse Error: {json_error}")
            logger.error(f"Response text: {response_text}")
            return None
        return _parse_ajax_response(json_data)
    except Exception as e:
        logger.error(f"Error in Playwright scraping: {e}")
        logger.error(f"Traceback: {traceback.format_exc()}")
        return None
def scrape_fft_club_tournaments_all_pages(
    club_code, club_name, start_date=None, end_date=None
):
    """Collect a club's tournaments by requesting result pages one after another.

    Pages are fetched with ``scrape_fft_club_tournaments`` starting at page 0.
    Fetching stops at the first page that yields no result or no tournaments,
    once the collected count reaches that page's ``total_results``, or when an
    exception is raised.

    Returns:
        A dict with ``tournaments`` (everything collected), ``total_results``
        and ``current_count`` (both the collected count) and ``pages_scraped``
        (index of the last page requested, plus one).
    """
    collected = []
    page = 0
    finished = False
    while not finished:
        try:
            page_result = scrape_fft_club_tournaments(
                club_code=club_code,
                club_name=club_name,
                start_date=start_date,
                end_date=end_date,
                page=page,
            )
            if not page_result:
                logger.warning(f"No result for page {page}")
                finished = True
                continue

            batch = page_result.get("tournaments", [])
            if not batch:
                # An empty page means there is nothing further to fetch.
                finished = True
                continue

            collected.extend(batch)
            expected_total = page_result.get("total_results", 0)
            if len(collected) < expected_total:
                page += 1
            else:
                finished = True
        except Exception as e:
            logger.error(f"Error on page {page}: {e}")
            finished = True

    collected_count = len(collected)
    return {
        "tournaments": collected,
        "total_results": collected_count,
        "current_count": collected_count,
        "pages_scraped": page + 1,
    }
def _parse_ajax_response(commands):
"""
Parse the AJAX response commands to extract tournament data
Returns data in the exact format expected by Swift FederalTournament struct
"""
tournaments = []
try:
# Check for alert commands (maintenance mode)
for command in commands:
if command.get("command") == "alert":
# logger.warning("Maintenance mode detected")
return None
# Find the command with results
result_command = None
for command in commands:
if command.get("command") == "recherche_tournois_update":
result_command = command
# # logger.info("Found recherche_tournois_update command!")
break
if result_command and result_command.get("results"):
results = result_command["results"]
items = results.get("items", [])
total_results = results.get("nb_results", 0)
# # logger.info(f"Processing {len(items)} tournaments from results")
for item in items:
# Parse dates - they're already in the correct format
date_debut = item.get("dateDebut")
date_fin = item.get("dateFin")
date_validation = item.get("dateValidation")
# Build the tournament object to match Swift FederalTournament structure
tournament = {
"id": str(item.get("id", "")),
"millesime": item.get("millesime"),
"libelle": item.get("libelle"),
"tmc": item.get("tmc"),
"tarifAdulteChampionnat": item.get("tarifAdulteChampionnat"),
"type": item.get("type"),
"ageReel": item.get("ageReel"),
"naturesTerrains": item.get("naturesTerrains", []),
"idsArbitres": item.get("idsArbitres", []),
"tarifJeuneChampionnat": item.get("tarifJeuneChampionnat"),
"international": item.get("international"),
"inscriptionEnLigne": item.get("inscriptionEnLigne"),
"categorieTournoi": item.get("categorieTournoi"),
"prixLot": item.get("prixLot"),
"paiementEnLigne": item.get("paiementEnLigne"),
"reductionAdherentJeune": item.get("reductionAdherentJeune"),
"reductionAdherentAdulte": item.get("reductionAdherentAdulte"),
"paiementEnLigneObligatoire": item.get(
"paiementEnLigneObligatoire"
),
"villeEngagement": item.get("villeEngagement"),
"senior": item.get("senior"),
"veteran": item.get("veteran"),
"inscriptionEnLigneEnCours": item.get("inscriptionEnLigneEnCours"),
"avecResultatPublie": item.get("avecResultatPublie"),
"code": item.get("code"),
"categorieAge": item.get("categorieAge"),
"codeComite": item.get("codeComite"),
"installations": item.get("installations", []),
"reductionEpreuveSupplementaireJeune": item.get(
"reductionEpreuveSupplementaireJeune"
),
"reductionEpreuveSupplementaireAdulte": item.get(
"reductionEpreuveSupplementaireAdulte"
),
"nomComite": item.get("nomComite"),
"naturesEpreuves": item.get("naturesEpreuves"),
"jeune": item.get("jeune"),
"courrielEngagement": item.get("courrielEngagement"),
"nomClub": item.get("nomClub"),
"installation": item.get("installation"),
"categorieAgeMax": item.get("categorieAgeMax"),
"tournoiInterne": item.get("tournoiInterne"),
"nomLigue": item.get("nomLigue"),
"nomEngagement": item.get("nomEngagement"),
"codeLigue": item.get("codeLigue"),
"modeleDeBalle": item.get("modeleDeBalle"),
"jugeArbitre": item.get("jugeArbitre"),
"adresse2Engagement": item.get("adresse2Engagement"),
"epreuves": item.get("epreuves"),
"dateDebut": date_debut,
"serie": item.get("serie"),
"dateFin": date_fin,
"dateValidation": date_validation,
"codePostalEngagement": item.get("codePostalEngagement"),
"codeClub": item.get("codeClub"),
"prixEspece": item.get("prixEspece"),
"japPhoneNumber": None, # Will be populated by separate umpire call
# Additional fields from the response
"adresse1Engagement": item.get("adresse1Engagement"),
"originalId": item.get("originalId"),
"familleTournoi": item.get("familleTournoi", []),
"isTournoi": item.get("isTournoi"),
"natureWithCatAge": item.get("natureWithCatAge"),
}
tournaments.append(tournament)
# # logger.info(
# f"Successfully parsed {len(tournaments)} tournaments from response"
# )
return {
"tournaments": tournaments,
"total_results": total_results,
"current_count": len(tournaments),
}
else:
logger.error("No recherche_tournois_update command found in AJAX response")
return {"tournaments": [], "total_results": 0, "current_count": 0}
except Exception as e:
logger.error(f"Error parsing AJAX response: {e}")
return None
def scrape_fft_all_tournaments(
    sorting_option=None,
    page=0,
    start_date=None,
    end_date=None,
    city="",
    distance=15,
    categories=None,
    levels=None,
    lat=None,
    lng=None,
    ages=None,
    tournament_types=None,
    national_cup=False,
):
    """Fetch one page of padel tournaments from the tenup.fft.fr search.

    Uses the same flow as ``scrape_fft_club_tournaments``: load the search
    page in a headless browser, read ``form_build_id``, then POST the search
    form from inside the page. Searches by city when ``city`` is non-blank,
    otherwise falls back to a ``ligue`` search.

    Args:
        sorting_option: Raw ``sort`` value; defaults to ``dateDebut+asc``.
        page: Result page index passed to the site.
        start_date: Optional start date string. When given without
            ``end_date`` it must parse as ``dd/mm/yy``.
        end_date: Optional end date string. When only ``start_date`` is
            given, the end defaults to 90 days after it.
        city: City name for a city search; blank selects the ligue search.
        distance: Search distance sent with a city search (cast to int).
        categories: Values sent as ``epreuve[...]`` filters.
        levels: Values sent as ``categorie_tournoi[...]`` filters.
        lat: Optional latitude sent with a city search.
        lng: Optional longitude sent with a city search.
        ages: Values sent as ``categorie_age[...]`` filters.
        tournament_types: Values sent, capitalised, as ``type[...]`` filters.
        national_cup: When true, adds ``tournoi_npc=1``.

    Returns:
        The value returned by ``_parse_ajax_response``, or ``None`` when the
        site redirects to Queue-It, the form token is missing, the request
        fails, the response is not JSON, or any exception is raised.
    """
    try:
        with sync_playwright() as p:
            browser = get_browser_for_environment(p)
            # try/finally closes the browser on every exit path.
            try:
                page_obj = browser.new_page()
                page_obj.set_extra_http_headers(
                    {
                        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/18.5 Safari/605.1.15"
                    }
                )
                page_obj.goto("https://tenup.fft.fr/recherche/tournois")
                page_obj.wait_for_timeout(500)
                if "queue-it.net" in page_obj.url.lower():
                    return None

                form_input = page_obj.query_selector('input[name="form_build_id"]')
                if not form_input:
                    return None
                form_build_id = form_input.get_attribute("value")

                date_component = ""
                if start_date and end_date:
                    date_component = f"&date[start]={start_date}&date[end]={end_date}"
                elif start_date:
                    try:
                        start_dt = datetime.strptime(start_date, "%d/%m/%y")
                        end_dt = start_dt + timedelta(days=90)
                        date_component = f"&date[start]={start_date}&date[end]={end_dt.strftime('%d/%m/%y')}"
                    except ValueError:
                        logger.warning(f"Invalid date format: {start_date}")

                filter_parts = []
                for category in categories or []:
                    filter_parts.append(f"&epreuve[{category}]={category}")
                for level in levels or []:
                    filter_parts.append(f"&categorie_tournoi[{level}]={level}")
                for age in ages or []:
                    filter_parts.append(f"&categorie_age[{age}]={age}")
                for t_type in tournament_types or []:
                    capitalized_type = t_type.capitalize()
                    filter_parts.append(f"&type[{capitalized_type}]={capitalized_type}")
                if national_cup:
                    filter_parts.append("&tournoi_npc=1")
                filter_params = "".join(filter_parts)

                sort_param = f"&sort={sorting_option or 'dateDebut+asc'}"

                # Shared by both search modes. The filters are part of it so
                # that city searches honour them too (they used to be sent
                # only for the ligue search).
                common_tail = (
                    f"&pratique=PADEL{date_component}{filter_params}"
                    f"&page={page}{sort_param}"
                    f"&form_build_id={form_build_id}&form_id=recherche_tournois_form"
                    "&_triggering_element_name=submit_page"
                    "&_triggering_element_value=Submit+page"
                )

                if city and city.strip():
                    # Percent-encode the free-text city so '&', '=' etc.
                    # cannot corrupt the form body.
                    city_name_encoded = quote_plus(city.strip())
                    base_params = (
                        "recherche_type=ville"
                        f"&ville[autocomplete][value_container][value_field]={city_name_encoded}"
                        f"&ville[autocomplete][value_container][label_field]={city_name_encoded}"
                    )
                    distance_param = f"&ville[distance][value_field]={int(distance)}"
                    location_params = ""
                    # Compare against None/"" rather than truthiness: 0.0 is
                    # a valid coordinate.
                    if lat not in (None, "") and lng not in (None, ""):
                        location_params = (
                            f"&ville[autocomplete][value_container][lat_field]={lat}"
                            f"&ville[autocomplete][value_container][lng_field]={lng}"
                        )
                    params = f"{base_params}{location_params}{distance_param}{common_tail}"
                else:
                    # Default to ligue search if no city provided
                    params = f"recherche_type=ligue{common_tail}"

                # The body is handed over as an evaluate() argument instead of
                # being interpolated into the script, so the data cannot alter
                # the JavaScript.
                ajax_script = """
                async (body) => {
                    try {
                        const response = await fetch('https://tenup.fft.fr/system/ajax', {
                            method: 'POST',
                            headers: {
                                'Accept': 'application/json, text/javascript, */*; q=0.01',
                                'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
                                'X-Requested-With': 'XMLHttpRequest',
                                'Origin': 'https://tenup.fft.fr',
                                'Referer': 'https://tenup.fft.fr/recherche/tournois'
                            },
                            body: body
                        });
                        const responseText = await response.text();
                        return {
                            success: response.ok,
                            status: response.status,
                            responseText: responseText
                        };
                    } catch (error) {
                        return {
                            success: false,
                            error: error.message
                        };
                    }
                }
                """
                result = page_obj.evaluate(ajax_script, params)
            finally:
                browser.close()

        if not result.get("success"):
            return None
        try:
            json_data = json.loads(result.get("responseText", ""))
        except json.JSONDecodeError:
            return None
        return _parse_ajax_response(json_data)
    except Exception as e:
        logger.error(f"Error in Playwright scraping: {e}")
        logger.error(f"Traceback: {traceback.format_exc()}")
        return None
def get_umpire_data(tournament_id):
    """Look up the umpire's name, email and phone for a tournament.

    Opens ``https://tenup.fft.fr/tournoi/<tournament_id>`` in a headless
    browser. If the site redirects to Queue-It, the URL is polled until the
    queue is left or the wait budget runs out. Extraction is tried with regex
    first and DOM selectors second. The requests-based fallback is used when
    the queue wait times out or the page fails to load.

    Returns:
        A ``(name, email, phone)`` tuple; ``(None, None, None)`` when nothing
        was found or an unexpected error occurred.
    """
    try:
        with sync_playwright() as playwright:
            browser = get_browser_for_environment(playwright)
            tab = browser.new_page()
            tournament_url = f"https://tenup.fft.fr/tournoi/{tournament_id}"
            try:
                tab.goto(tournament_url, timeout=30000, wait_until="domcontentloaded")

                if "queue-it.net" in tab.url.lower():
                    # Budget in milliseconds: 30 s when DEBUG, otherwise 2 min.
                    queue_budget_ms = 30000 if settings.DEBUG else 120000
                    poll_ms = 1000  # re-check the URL once per second
                    waited_ms = 0
                    while (
                        waited_ms < queue_budget_ms
                        and "queue-it.net" in tab.url.lower()
                    ):
                        tab.wait_for_timeout(poll_ms)
                        waited_ms += poll_ms

                    # Still queued after the budget: give up on the browser.
                    if "queue-it.net" in tab.url.lower():
                        browser.close()
                        return _get_umpire_data_requests_fallback(tournament_id)

                tab.wait_for_load_state("networkidle", timeout=20000)

                name, email, phone = _extract_umpire_with_regex(tab.content())
                if not (name or email or phone):
                    # Regex found nothing; query the live DOM instead.
                    name, email, phone = _extract_umpire_with_selectors(tab)
                browser.close()

                if not (name or email or phone):
                    logger.warning(
                        f"No umpire data found for tournament {tournament_id}"
                    )
                    return None, None, None
                return name, email, phone
            except Exception as page_error:
                logger.error(
                    f"Error loading tournament page {tournament_id}: {page_error}"
                )
                browser.close()
                return _get_umpire_data_requests_fallback(tournament_id)
    except Exception as e:
        logger.error(f"Error in umpire data extraction for {tournament_id}: {e}")
        return None, None, None
def _extract_umpire_with_regex(html_content):
"""
Extract umpire data using regex patterns
"""
# Extract name
name_pattern = r'tournoi-detail-page-inscription-responsable-title">\s*([^<]+)\s*<'
name_match = re.search(name_pattern, html_content)
name = name_match.group(1).strip() if name_match else None
# Extract email
email_pattern = r'mailto:([^"]+)"'
email_match = re.search(email_pattern, html_content)
email = email_match.group(1) if email_match else None
# Extract phone - try multiple patterns
phone_patterns = [
r'<div class="details-bloc">\s*(\d{2}\s+\d{2}\s+\d{2}\s+\d{2}\s+\d{2})\s*</div>',
r"(\d{2}\.\d{2}\.\d{2}\.\d{2}\.\d{2})",
r"(\d{10})",
r"(\+33\s?\d{1}\s?\d{2}\s?\d{2}\s?\d{2}\s?\d{2})",
]
phone = None
for pattern in phone_patterns:
phone_match = re.search(pattern, html_content)
if phone_match:
phone = phone_match.group(1).strip()
break
return name, email, phone
def _extract_umpire_with_selectors(page):
"""
Extract umpire data using DOM selectors as fallback
"""
name = None
email = None
phone = None
try:
# Try different selectors for name
name_selectors = [
".tournoi-detail-page-inscription-responsable-title",
'[class*="responsable-title"]',
'[class*="umpire-name"]',
'h3:has-text("Responsable")',
]
for selector in name_selectors:
try:
element = page.query_selector(selector)
if element:
name = element.inner_text().strip()
if name:
break
except:
continue
# Try different selectors for email
email_selectors = [
'a[href^="mailto:"]',
'[class*="email"]',
]
for selector in email_selectors:
try:
element = page.query_selector(selector)
if element:
href = element.get_attribute("href")
if href and href.startswith("mailto:"):
email = href.replace("mailto:", "")
break
text = element.inner_text().strip()
if "@" in text:
email = text
break
except:
continue
# Try different selectors for phone
phone_selectors = [
".details-bloc",
'[class*="phone"]',
'[class*="telephone"]',
]
for selector in phone_selectors:
try:
element = page.query_selector(selector)
if element:
text = element.inner_text().strip()
# Check if text looks like a phone number
if re.match(r"[\d\s\.\+\-\(\)]{8,}", text):
phone = text
break
except:
continue
except Exception as e:
logger.warning(f"Error in selector-based extraction: {e}")
return name, email, phone
def _get_umpire_data_requests_fallback(tournament_id):
    """
    Enhanced fallback method using requests

    Fetches the tournament page with ``requests``, tries
    ``_extract_umpire_with_regex`` and, if that finds nothing, a
    BeautifulSoup-based search.

    Returns a ``(name, email, phone)`` tuple, ``(None, None, None)`` on
    Queue-It redirect, non-200 status or any exception.

    NOTE(review): a second function with this exact name is defined further
    down in this module; that later definition replaces this one when the
    module is imported, so this version is not the one callers reach.
    NOTE(review): ``BeautifulSoup`` is not imported by the import block at the
    top of this file; unless it is brought into scope elsewhere, the
    BeautifulSoup branch raises NameError, which the inner ``except`` turns
    into a warning log.
    """
    # logger.info(f"Using requests fallback for tournament {tournament_id}")
    try:
        headers = {
            "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36",
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
            "Accept-Language": "fr-FR,fr;q=0.9,en;q=0.8",
            "Accept-Encoding": "gzip, deflate, br",
            "Connection": "keep-alive",
        }
        url = f"https://tenup.fft.fr/tournoi/{tournament_id}"
        response = requests.get(url, headers=headers, timeout=15)
        # response.url is the final URL, so this detects a Queue-It redirect.
        if "queue-it.net" in response.url:
            # logger.warning("Requests fallback also hit Queue-It")
            return None, None, None
        if response.status_code != 200:
            # logger.error(f"Requests fallback failed: {response.status_code}")
            return None, None, None
        # Try regex extraction on requests response
        name, email, phone = _extract_umpire_with_regex(response.text)
        # If regex fails, try BeautifulSoup
        if not name and not email and not phone:
            try:
                soup = BeautifulSoup(response.text, "html.parser")
                # Name: first element whose class contains "responsable"
                name_elements = soup.find_all(
                    class_=lambda x: x and "responsable" in x.lower()
                )
                if name_elements:
                    name = name_elements[0].get_text().strip()
                # Email: first mailto link
                email_links = soup.find_all(
                    "a", href=lambda x: x and x.startswith("mailto:")
                )
                if email_links:
                    email = email_links[0]["href"].replace("mailto:", "")
                # Phone: first text node containing five two-digit groups
                # separated by a space or a dot
                for element in soup.find_all(text=True):
                    if re.search(
                        r"\d{2}[\s\.]\d{2}[\s\.]\d{2}[\s\.]\d{2}[\s\.]\d{2}",
                        str(element),
                    ):
                        phone = str(element).strip()
                        break
            except Exception as soup_error:
                logger.warning(f"BeautifulSoup parsing failed: {soup_error}")
        # logger.info(
        # f"Requests fallback result: name={name}, email={email}, phone={phone}"
        # )
        return name, email, phone
    except Exception as e:
        logger.error(f"Requests fallback error: {e}")
        return None, None, None
def _get_umpire_data_requests_fallback(tournament_id):
    """Fetch umpire contact details with a plain HTTP request.

    Fallback for when the browser-based lookup cannot be used. Downloads
    ``https://tenup.fft.fr/tournoi/<tournament_id>`` and applies the regex
    patterns directly to the HTML.

    Args:
        tournament_id: Identifier inserted into the tournament URL.

    Returns:
        A ``(name, email, phone)`` tuple with ``None`` for anything not
        found; ``(None, None, None)`` when the request lands on Queue-It,
        returns a non-200 status, or raises.
    """
    try:
        url = f"https://tenup.fft.fr/tournoi/{tournament_id}"
        headers = {
            "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/18.5 Safari/605.1.15"
        }
        response = requests.get(url, headers=headers, timeout=30)

        # response.url is the final URL after redirects. A queue-it.net
        # address means the waiting room was served, not the tournament page.
        if "queue-it.net" in response.url.lower():
            logger.warning("Requests fallback hit Queue-It")
            return None, None, None

        if response.status_code != 200:
            logger.error(f"Failed to fetch tournament page: {response.status_code}")
            return None, None, None

        html_content = response.text

        name_match = re.search(
            r'tournoi-detail-page-inscription-responsable-title">\s*([^<]+)\s*<',
            html_content,
        )
        name = name_match.group(1).strip() if name_match else None

        email_match = re.search(r'mailto:([^"]+)"', html_content)
        email = email_match.group(1) if email_match else None

        phone_match = re.search(
            r'<div class="details-bloc">\s*(\d{2}\s+\d{2}\s+\d{2}\s+\d{2}\s+\d{2})\s*</div>',
            html_content,
        )
        phone = phone_match.group(1).strip() if phone_match else None

        return name, email, phone
    except Exception as e:
        logger.error(f"Error getting umpire data with requests: {e}")
        return None, None, None
def _scrape_single_page(
    sorting_option,
    page,
    start_date,
    end_date,
    city,
    distance,
    categories,
    levels,
    lat,
    lng,
    ages,
    tournament_types,
    national_cup,
):
    """Fetch one result page via ``scrape_fft_all_tournaments``.

    Takes every search argument positionally and forwards them by keyword.
    Returns whatever ``scrape_fft_all_tournaments`` returns.
    """
    search_kwargs = {
        "sorting_option": sorting_option,
        "page": page,
        "start_date": start_date,
        "end_date": end_date,
        "city": city,
        "distance": distance,
        "categories": categories,
        "levels": levels,
        "lat": lat,
        "lng": lng,
        "ages": ages,
        "tournament_types": tournament_types,
        "national_cup": national_cup,
    }
    return scrape_fft_all_tournaments(**search_kwargs)
def scrape_fft_all_tournaments_concurrent(
    sorting_option=None,
    start_date=None,
    end_date=None,
    city="",
    distance=15,
    categories=None,
    levels=None,
    lat=None,
    lng=None,
    ages=None,
    tournament_types=None,
    national_cup=False,
    max_workers=10,
):
    """Fetch every result page after page 0 concurrently.

    Page 0 is requested here only to learn the total result count and the
    page size. Its tournaments are not included in the return value, on the
    original author's assumption that the client already has page 0.

    Args:
        sorting_option, start_date, end_date, city, distance, categories,
        levels, lat, lng, ages, tournament_types, national_cup: Forwarded
            unchanged to ``scrape_fft_all_tournaments``.
        max_workers: Upper bound on concurrent page fetches. Each fetch is a
            ``scrape_fft_all_tournaments`` call, which launches its own
            browser.

    Returns:
        ``None`` when page 0 cannot be fetched. Otherwise a dict with
        ``tournaments`` (pages 1..end, in page order), ``total_results`` (as
        reported by page 0), ``current_count`` and ``pages_scraped`` (number
        of pages requested here, excluding page 0).
    """
    first_page_result = scrape_fft_all_tournaments(
        sorting_option=sorting_option,
        page=0,
        start_date=start_date,
        end_date=end_date,
        city=city,
        distance=distance,
        categories=categories,
        levels=levels,
        lat=lat,
        lng=lng,
        ages=ages,
        tournament_types=tournament_types,
        national_cup=national_cup,
    )
    if not first_page_result:
        logger.error("Failed to get first page results for pagination info")
        return None

    # `or` guards against an explicit null in the parsed response.
    total_results = first_page_result.get("total_results") or 0
    results_per_page = len(first_page_result.get("tournaments") or [])

    if total_results == 0:
        return {
            "tournaments": [],
            "total_results": 0,
            "current_count": 0,
            "pages_scraped": 0,
        }

    # Ceiling division; an empty first page is treated as a single page.
    if results_per_page > 0:
        total_pages = (total_results + results_per_page - 1) // results_per_page
    else:
        total_pages = 1

    # Only one page in total: page 0 is excluded, so nothing remains.
    if total_pages <= 1:
        return {
            "tournaments": [],
            "total_results": total_results,
            "current_count": 0,
            "pages_scraped": 0,
        }

    remaining_pages = range(1, total_pages)
    # Honour the caller's max_workers instead of a fixed pool size.
    worker_count = max(1, min(len(remaining_pages), max_workers or 1))

    all_tournaments = []
    with ThreadPoolExecutor(max_workers=worker_count) as executor:
        futures = [
            (
                page,
                executor.submit(
                    _scrape_single_page,
                    sorting_option,
                    page,
                    start_date,
                    end_date,
                    city,
                    distance,
                    categories,
                    levels,
                    lat,
                    lng,
                    ages,
                    tournament_types,
                    national_cup,
                ),
            )
            for page in remaining_pages
        ]
        # Results are gathered in submission (page) order, which keeps the
        # combined list in the same order as the site's pagination.
        for page, future in futures:
            try:
                result = future.result(timeout=60)  # 60 second timeout per page
                if result and result.get("tournaments"):
                    all_tournaments.extend(result.get("tournaments", []))
                else:
                    logger.warning(f"Page {page} returned no results")
            except Exception as e:
                logger.error(f"Error processing page {page}: {e}")

    return {
        "tournaments": all_tournaments,
        "total_results": total_results,
        "current_count": len(all_tournaments),
        # Excluding page 0 which was handled separately
        "pages_scraped": total_pages - 1,
    }
def _parse_clubs_ajax_response(json_data):
"""
Parse the clubs AJAX response to match Swift FederalClubResponse structure
"""
try:
# Log the raw response structure to understand what we're getting
# # logger.info(f"Raw clubs response structure: {json_data}")
club_markers = []
total_results = 0
# Try to extract clubs data from different possible response structures
if isinstance(json_data, dict):
# Pattern 1: Direct club_markers array
if "club_markers" in json_data:
clubs_data = json_data["club_markers"]
total_results = json_data.get("nombreResultat", len(clubs_data))
# Pattern 2: Results wrapper
elif "results" in json_data:
results = json_data["results"]
clubs_data = results.get(
"clubs", results.get("items", results.get("club_markers", []))
)
total_results = results.get(
"nombreResultat",
results.get("total", results.get("nb_results", len(clubs_data))),
)
# Pattern 3: Direct array in response
elif "data" in json_data:
clubs_data = json_data["data"]
total_results = len(clubs_data)
# Pattern 4: Response is the clubs array directly
else:
clubs_data = json_data if isinstance(json_data, list) else []
total_results = len(clubs_data)
elif isinstance(json_data, list):
clubs_data = json_data
total_results = len(clubs_data)
else:
logger.error(f"Unexpected response format: {type(json_data)}")
clubs_data = []
total_results = 0
# Parse each club to match ClubMarker structure
for item in clubs_data:
if isinstance(item, dict):
# Extract pratiques array
pratiques = []
if "pratiques" in item:
pratiques = item["pratiques"]
elif "practices" in item:
pratiques = item["practices"]
else:
# Default to PADEL if not specified
pratiques = ["PADEL"]
# Ensure pratiques are uppercase strings
pratiques = [
p.upper() if isinstance(p, str) else str(p).upper()
for p in pratiques
]
club_marker = {
"nom": item.get("nom", item.get("name", "")),
"clubId": str(
item.get("clubId", item.get("id", item.get("code", "")))
),
"ville": item.get("ville", item.get("city", "")),
"distance": str(item.get("distance", "0")),
"terrainPratiqueLibelle": item.get(
"terrainPratiqueLibelle", item.get("courtsInfo", "")
),
"pratiques": pratiques,
"lat": float(item.get("lat", item.get("latitude", 0.0))),
"lng": float(item.get("lng", item.get("longitude", 0.0))),
}
club_markers.append(club_marker)
# logger.info(
# f"Successfully parsed {len(club_markers)} club markers from response"
# )
# Return the response in the format expected by Swift FederalClubResponse
return {
"typeRecherche": "clubs",
"nombreResultat": total_results,
"club_markers": club_markers,
}
except Exception as e:
logger.error(f"Error parsing clubs AJAX response: {e}")
return {"typeRecherche": "clubs", "nombreResultat": 0, "club_markers": []}
def scrape_federal_clubs(
    country=None, city="", latitude=None, longitude=None, radius=15, max_workers=5
):
    """Fetch padel clubs near a city from the tenup.fft.fr club search.

    Opens the results page in a headless browser and reads the club list from
    the page's ``Drupal.settings.fft_recherche_club`` JavaScript object.

    Args:
        country: Country code for the query; ``"fr"`` when falsy.
        city: City to search around. A trailing 5-digit zip code and whatever
            follows it are stripped before the request.
        latitude: Currently unused.
        longitude: Currently unused.
        radius: Search distance sent to the site (cast to int).
        max_workers: Currently unused.

    Returns:
        ``{"typeRecherche": str, "nombreResultat": int, "club_markers":
        [...]}``. On Queue-It redirect, extraction failure or any exception
        the marker list is empty and the count is 0.
    """
    empty_response = {"typeRecherche": "clubs", "nombreResultat": 0, "club_markers": []}
    try:
        # Clean up city name - remove zip code and extra info
        clean_city = city or ""
        if clean_city:
            clean_city = re.sub(r"[,\s]*\d{5}.*$", "", clean_city).strip()
            clean_city = clean_city.rstrip(",").strip()

        # urlencode escapes spaces, accents and '&' so a city name cannot
        # break or extend the query string.
        query = urlencode(
            {
                "ville": clean_city,
                "pratique": "PADEL",
                "distance": int(radius),
                "country": country or "fr",
            },
            quote_via=quote,
        )
        results_url = f"https://tenup.fft.fr/recherche/clubs/resultats?{query}"

        # Runs inside the page; reports failure instead of throwing.
        extraction_script = """
        () => {
            try {
                // Check if Drupal.settings exists and has the data
                if (typeof Drupal !== 'undefined' &&
                    Drupal.settings &&
                    Drupal.settings.fft_recherche_club) {
                    const data = Drupal.settings.fft_recherche_club;
                    return {
                        success: true,
                        typeRecherche: data.typeRecherche || 'club',
                        total: data.total || 0,
                        resultat: data.resultat || []
                    };
                }
                return {
                    success: false,
                    error: 'Drupal.settings.fft_recherche_club not found'
                };
            } catch (error) {
                return {
                    success: false,
                    error: error.message
                };
            }
        }
        """

        with sync_playwright() as p:
            browser = get_browser_for_environment(p)
            # try/finally closes the browser on every exit path.
            try:
                page_obj = browser.new_page()
                page_obj.set_extra_http_headers(
                    {
                        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/18.5 Safari/605.1.15"
                    }
                )
                page_obj.goto(results_url)
                page_obj.wait_for_timeout(2000)
                if "queue-it.net" in page_obj.url.lower():
                    logger.warning("Hit Queue-It on results page")
                    return empty_response
                result = page_obj.evaluate(extraction_script)
            finally:
                browser.close()

        if not result.get("success"):
            logger.error(f"Failed to extract data: {result.get('error')}")
            return empty_response

        club_markers = [
            {
                "nom": club.get("nom", ""),
                "clubId": club.get("clubId", ""),
                "ville": club.get("ville", ""),
                "distance": club.get("distance", ""),
                "terrainPratiqueLibelle": club.get("terrainPratiqueLibelle", ""),
                "pratiques": club.get("pratiques", []),
                "lat": club.get("lat", 0.0),
                "lng": club.get("lng", 0.0),
            }
            for club in result.get("resultat", [])
        ]
        return {
            "typeRecherche": result.get("typeRecherche", "club"),
            "nombreResultat": result.get("total", 0),
            "club_markers": club_markers,
        }
    except Exception as e:
        logger.error(f"Error in federal clubs scraping: {e}")
        logger.error(f"Traceback: {traceback.format_exc()}")
        return {"typeRecherche": "clubs", "nombreResultat": 0, "club_markers": []}
def get_browser_for_environment(playwright_instance):
    """Launch a headless browser chosen from the Django ``DEBUG`` setting.

    - ``DEBUG`` false or unset: Chromium.
    - ``DEBUG`` true: Firefox, falling back to single-process Chromium when
      the Firefox launch raises.

    Returns:
        The launched browser object.
    """
    chromium_args = ["--no-sandbox", "--disable-dev-shm-usage"]

    if not getattr(settings, "DEBUG", False):
        return playwright_instance.chromium.launch(headless=True, args=chromium_args)

    try:
        return playwright_instance.firefox.launch(
            headless=True, args=["--no-sandbox"]
        )
    except Exception as firefox_error:
        logger.warning(
            f"Firefox failed in dev, falling back to Chromium: {firefox_error}"
        )

    return playwright_instance.chromium.launch(
        headless=True,
        args=chromium_args + ["--single-process"],
    )