|
|
|
|
@@ -6,6 +6,8 @@ from datetime import datetime, timedelta
 import json
 import traceback
 from concurrent.futures import ThreadPoolExecutor, as_completed
+from django.conf import settings
+from bs4 import BeautifulSoup
 
 logger = logging.getLogger(__name__)
 
@@ -32,7 +34,7 @@ def scrape_fft_club_tournaments(
     logger.info(f"Starting Playwright scraping for {club_name}")
     try:
         with sync_playwright() as p:
-            browser = p.chromium.launch(headless=True)
+            browser = get_browser_for_environment(p)
             page_obj = browser.new_page()
 
             page_obj.set_extra_http_headers(
@@ -43,10 +45,10 @@ def scrape_fft_club_tournaments(
             # Navigate to FFT
             target_url = "https://tenup.fft.fr/recherche/tournois"
-            logger.info(f"Navigating to: {target_url}")
+            # logger.info(f"Navigating to: {target_url}")
 
             page_obj.goto(target_url)
             # page_obj.wait_for_timeout(7000)
             page_obj.wait_for_timeout(2000)
 
             current_url = page_obj.url
             logger.info(f"Current URL: {current_url}")
@@ -64,7 +66,7 @@ def scrape_fft_club_tournaments(
                 return None
 
             form_build_id = form_input.get_attribute("value")
-            logger.info(f"Extracted form_build_id: {form_build_id}")
+            # logger.info(f"Extracted form_build_id: {form_build_id}")
 
             # Build parameters
             date_component = ""
@@ -83,7 +85,7 @@ def scrape_fft_club_tournaments(
 
             params = f"recherche_type=club&club[autocomplete][value_container][value_field]={club_code_clean}&club[autocomplete][value_container][label_field]={club_name_encoded}&pratique=PADEL{date_component}&page={page}&sort=dateDebut+asc&form_build_id={form_build_id}&form_id=recherche_tournois_form&_triggering_element_name=submit_page&_triggering_element_value=Submit+page"
 
-            logger.info(f"AJAX Parameters: {params}")
+            # logger.info(f"AJAX Parameters: {params}")
 
             # Make AJAX request and capture the full response
             ajax_script = f"""
@ -118,24 +120,24 @@ def scrape_fft_club_tournaments( |
|
|
|
|
}} |
|
|
|
|
""" |
|
|
|
|
|
|
|
|
|
logger.info("Making AJAX request...") |
|
|
|
|
# logger.info("Making AJAX request...") |
|
|
|
|
result = page_obj.evaluate(ajax_script) |
|
|
|
|
|
|
|
|
|
browser.close() |
|
|
|
|
|
|
|
|
|
# Print the full response for debugging |
|
|
|
|
logger.info(f"AJAX Response Status: {result.get('status')}") |
|
|
|
|
logger.info(f"AJAX Response Success: {result.get('success')}") |
|
|
|
|
# logger.info(f"AJAX Response Status: {result.get('status')}") |
|
|
|
|
# logger.info(f"AJAX Response Success: {result.get('success')}") |
|
|
|
|
|
|
|
|
|
if result.get("success"): |
|
|
|
|
response_text = result.get("responseText", "") |
|
|
|
|
logger.info(f"Raw Response Length: {len(response_text)}") |
|
|
|
|
# logger.info(f"Raw Response Length: {len(response_text)}") |
|
|
|
|
# logger.info(f"Raw Response (first 500 chars): {response_text[:500]}") |
|
|
|
|
|
|
|
|
|
try: |
|
|
|
|
# Try to parse as JSON |
|
|
|
|
json_data = json.loads(response_text) |
|
|
|
|
logger.info(f"JSON Response Type: {type(json_data)}") |
|
|
|
|
# logger.info(f"JSON Response Type: {type(json_data)}") |
|
|
|
|
# logger.info(f"JSON Response: {json.dumps(json_data, indent=2, default=str)}") |
|
|
|
|
|
|
|
|
|
# Now try to parse it |
|
|
|
|
@@ -164,7 +166,7 @@ def scrape_fft_club_tournaments_all_pages(
     """
     Scrapes all pages of FFT tournaments for a specific club
     """
-    logger.info(f"Starting complete tournament scraping for {club_name}")
+    # logger.info(f"Starting complete tournament scraping for {club_name}")
 
     all_tournaments = []
     page = 0
@@ -237,7 +239,7 @@ def _parse_ajax_response(commands):
     # Check for alert commands (maintenance mode)
     for command in commands:
         if command.get("command") == "alert":
-            logger.warning("Maintenance mode detected")
+            # logger.warning("Maintenance mode detected")
             return None
 
     # Find the command with results
@ -245,7 +247,7 @@ def _parse_ajax_response(commands): |
|
|
|
|
for command in commands: |
|
|
|
|
if command.get("command") == "recherche_tournois_update": |
|
|
|
|
result_command = command |
|
|
|
|
logger.info("Found recherche_tournois_update command!") |
|
|
|
|
# logger.info("Found recherche_tournois_update command!") |
|
|
|
|
break |
|
|
|
|
|
|
|
|
|
if result_command and result_command.get("results"): |
|
|
|
|
@@ -253,7 +255,7 @@ def _parse_ajax_response(commands):
         items = results.get("items", [])
         total_results = results.get("nb_results", 0)
 
-        logger.info(f"Processing {len(items)} tournaments from results")
+        # logger.info(f"Processing {len(items)} tournaments from results")
 
         for item in items:
             # Parse dates - they're already in the correct format
@@ -331,9 +333,9 @@ def _parse_ajax_response(commands):
 
             tournaments.append(tournament)
 
-        logger.info(
-            f"Successfully parsed {len(tournaments)} tournaments from response"
-        )
+        # logger.info(
+        #     f"Successfully parsed {len(tournaments)} tournaments from response"
+        # )
         return {
             "tournaments": tournaments,
             "total_results": total_results,
@@ -371,7 +373,7 @@ def scrape_fft_all_tournaments(
 
     try:
         with sync_playwright() as p:
-            browser = p.chromium.launch(headless=True)
+            browser = get_browser_for_environment(p)
             page_obj = browser.new_page()
 
             page_obj.set_extra_http_headers(
@@ -382,28 +384,28 @@ def scrape_fft_all_tournaments(
             # Navigate to FFT
             target_url = "https://tenup.fft.fr/recherche/tournois"
-            logger.info(f"Navigating to: {target_url}")
+            # logger.info(f"Navigating to: {target_url}")
 
             page_obj.goto(target_url)
             # page_obj.wait_for_timeout(7000)
             page_obj.wait_for_timeout(500)
 
             current_url = page_obj.url
-            logger.info(f"Current URL: {current_url}")
+            # logger.info(f"Current URL: {current_url}")
 
             if "queue-it.net" in current_url.lower():
-                logger.warning("Still in Queue-It")
+                # logger.warning("Still in Queue-It")
                 browser.close()
                 return None
 
             # Extract form_build_id
             form_input = page_obj.query_selector('input[name="form_build_id"]')
             if not form_input:
-                logger.error("Could not find form_build_id")
+                # logger.error("Could not find form_build_id")
                 browser.close()
                 return None
 
             form_build_id = form_input.get_attribute("value")
-            logger.info(f"Extracted form_build_id: {form_build_id}")
+            # logger.info(f"Extracted form_build_id: {form_build_id}")
 
             # Build parameters - EXACT same pattern as club function
             date_component = ""
@@ -422,32 +424,32 @@ def scrape_fft_all_tournaments(
 
             # Add categories filter
             if categories:
-                logger.info(f"Adding categories filter: {categories}")
+                # logger.info(f"Adding categories filter: {categories}")
                 for category in categories:
                     filter_params += f"&epreuve[{category}]={category}"
 
             # Add levels filter
             if levels:
-                logger.info(f"Adding levels filter: {levels}")
+                # logger.info(f"Adding levels filter: {levels}")
                 for level in levels:
                     filter_params += f"&categorie_tournoi[{level}]={level}"
 
             # Add ages filter
             if ages:
-                logger.info(f"Adding ages filter: {ages}")
+                # logger.info(f"Adding ages filter: {ages}")
                 for age in ages:
                     filter_params += f"&categorie_age[{age}]={age}"
 
             # Add types filter
             if tournament_types:
-                logger.info(f"Adding types filter: {tournament_types}")
+                # logger.info(f"Adding types filter: {tournament_types}")
                 for t_type in tournament_types:
                     capitalized_type = t_type.capitalize()
                     filter_params += f"&type[{capitalized_type}]={capitalized_type}"
 
             # Add national cup filter
             if national_cup:
-                logger.info("Adding national cup filter")
+                # logger.info("Adding national cup filter")
                 filter_params += "&tournoi_npc=1"
 
             # Fix the sorting parameter
@@ -477,7 +479,7 @@ def scrape_fft_all_tournaments(
             # Default to ligue search if no city provided
             params = f"recherche_type=ligue&pratique=PADEL{date_component}{filter_params}&page={page}&sort={sorting_option or 'dateDebut+asc'}&form_build_id={form_build_id}&form_id=recherche_tournois_form&_triggering_element_name=submit_page&_triggering_element_value=Submit+page"
 
-            logger.info(f"AJAX Parameters: {params}")
+            # logger.info(f"AJAX Parameters: {params}")
 
             # Make AJAX request and capture the full response - EXACT same as club function
             ajax_script = f"""
@@ -512,23 +514,23 @@ def scrape_fft_all_tournaments(
             }}
             """
 
-            logger.info("Making AJAX request...")
+            # logger.info("Making AJAX request...")
             result = page_obj.evaluate(ajax_script)
 
             browser.close()
 
             # Print the full response for debugging - EXACT same as club function
-            logger.info(f"AJAX Response Status: {result.get('status')}")
-            logger.info(f"AJAX Response Success: {result.get('success')}")
+            # logger.info(f"AJAX Response Status: {result.get('status')}")
+            # logger.info(f"AJAX Response Success: {result.get('success')}")
 
             if result.get("success"):
                 response_text = result.get("responseText", "")
-                logger.info(f"Raw Response Length: {len(response_text)}")
+                # logger.info(f"Raw Response Length: {len(response_text)}")
 
                 try:
                     # Try to parse as JSON
                     json_data = json.loads(response_text)
-                    logger.info(f"JSON Response Type: {type(json_data)}")
+                    # logger.info(f"JSON Response Type: {type(json_data)}")
 
                     # Now try to parse it - EXACT same as club function
                     parsed_result = _parse_ajax_response(json_data)
@@ -536,11 +538,11 @@ def scrape_fft_all_tournaments(
                     return parsed_result
 
                 except json.JSONDecodeError as json_error:
-                    logger.error(f"JSON Parse Error: {json_error}")
-                    logger.error(f"Response text: {response_text}")
+                    # logger.error(f"JSON Parse Error: {json_error}")
+                    # logger.error(f"Response text: {response_text}")
                     return None
             else:
-                logger.error(f"AJAX request failed: {result.get('error')}")
+                # logger.error(f"AJAX request failed: {result.get('error')}")
                 return None
 
     except Exception as e:
@@ -551,73 +553,277 @@ def scrape_fft_all_tournaments(
 
 def get_umpire_data(tournament_id):
     """
-    Fast umpire data extraction using Playwright (optimized for speed)
+    Umpire data extraction with improved Queue-It handling
     """
-    logger.info(f"Getting umpire data for tournament {tournament_id}")
+    # logger.info(f"Getting umpire data for tournament {tournament_id}")
 
     try:
         with sync_playwright() as p:
-            browser = p.chromium.launch(
-                headless=True,
-                args=[
-                    "--no-sandbox",
-                    "--disable-dev-shm-usage",
-                    "--disable-images",  # Don't load images
-                    "--disable-javascript",  # Disable JS for faster loading
-                    "--disable-plugins",
-                    "--disable-extensions",
-                ],
-            )
+            browser = get_browser_for_environment(p)
             page = browser.new_page()
 
-            # Navigate to tournament page quickly
+            # Navigate to tournament page
             url = f"https://tenup.fft.fr/tournoi/{tournament_id}"
-            logger.info(f"Navigating to tournament page: {url}")
+            # logger.info(f"Navigating to tournament page: {url}")
 
             try:
-                # Fast navigation - don't wait for everything to load
-                page.goto(url, timeout=15000, wait_until="domcontentloaded")
+                # Navigate with reasonable timeout
+                page.goto(url, timeout=30000, wait_until="domcontentloaded")
 
-                # Quick Queue-It check
+                # Enhanced Queue-It handling (similar to tournament search)
                 if "queue-it.net" in page.url.lower():
-                    logger.warning("Hit Queue-It on tournament page")
-                    browser.close()
-                    return None, None, None
-
-                # Extract data using the fastest method - regex on HTML content
+                    # logger.warning(f"Queue-It detected for tournament {tournament_id}")
+
+                    # Wait strategy based on environment
+                    max_queue_wait = (
+                        120000 if not settings.DEBUG else 30000
+                    )  # 2 min prod, 30s dev
+                    check_interval = 1000  # 1 second
+                    elapsed_time = 0
+
+                    # logger.info(
+                    #     f"Waiting in queue for umpire data (max {max_queue_wait / 1000}s)..."
+                    # )
+
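+                    # Poll the current URL until Queue-It redirects back to the
+                    # tournament page or this environment's wait budget runs out.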
+                    while (
+                        elapsed_time < max_queue_wait
+                        and "queue-it.net" in page.url.lower()
+                    ):
+                        # logger.info(
+                        #     f"Umpire queue wait: {elapsed_time / 1000}s elapsed"
+                        # )
+                        page.wait_for_timeout(check_interval)
+                        elapsed_time += check_interval
+
+                        # Check if we've been redirected
+                        if "queue-it.net" not in page.url.lower():
+                            # logger.info(
+                            #     "Successfully passed through queue for umpire data!"
+                            # )
+                            break
+
+                    # If still in queue, try fallback or return None
+                    if "queue-it.net" in page.url.lower():
+                        # logger.warning(
+                        #     "Umpire queue timeout - trying requests fallback"
+                        # )
+                        browser.close()
+                        return _get_umpire_data_requests_fallback(tournament_id)
+
+                # Wait for page to load properly
+                page.wait_for_load_state("networkidle", timeout=20000)
+
+                # Extract data using multiple strategies
                 html_content = page.content()
 
-                # Extract name
-                name_pattern = (
-                    r'tournoi-detail-page-inscription-responsable-title">\s*([^<]+)\s*<'
-                )
-                name_match = re.search(name_pattern, html_content)
-                name = name_match.group(1).strip() if name_match else None
-
-                # Extract email
-                email_pattern = r'mailto:([^"]+)"'
-                email_match = re.search(email_pattern, html_content)
-                email = email_match.group(1) if email_match else None
+                # Strategy 1: Try existing regex patterns
+                name, email, phone = _extract_umpire_with_regex(html_content)
 
-                # Extract phone
-                phone_pattern = r'<div class="details-bloc">\s*(\d{2}\s+\d{2}\s+\d{2}\s+\d{2}\s+\d{2})\s*</div>'
-                phone_match = re.search(phone_pattern, html_content)
-                phone = phone_match.group(1).strip() if phone_match else None
+                # Strategy 2: If regex fails, try DOM selectors
+                if not name and not email and not phone:
+                    name, email, phone = _extract_umpire_with_selectors(page)
 
                 browser.close()
 
-                logger.info(
-                    f"Extracted umpire data: name={name}, email={email}, phone={phone}"
-                )
-                return name, email, phone
+                if name or email or phone:
+                    # logger.info(
+                    #     f"Successfully extracted umpire data: name={name}, email={email}, phone={phone}"
+                    # )
+                    return name, email, phone
+                else:
+                    logger.warning(
+                        f"No umpire data found for tournament {tournament_id}"
+                    )
+                    return None, None, None
 
             except Exception as page_error:
-                logger.error(f"Error loading tournament page: {page_error}")
+                logger.error(
+                    f"Error loading tournament page {tournament_id}: {page_error}"
+                )
                 browser.close()
-                return None, None, None
 
+                # Try requests fallback
+                logger.info("Trying requests fallback after Playwright error")
+                return _get_umpire_data_requests_fallback(tournament_id)
+
     except Exception as e:
         logger.error(f"Error in umpire data extraction for {tournament_id}: {e}")
         return None, None, None
 
+
+def _extract_umpire_with_regex(html_content):
+    """
+    Extract umpire data using regex patterns
+    """
+    # Extract name
+    name_pattern = r'tournoi-detail-page-inscription-responsable-title">\s*([^<]+)\s*<'
+    name_match = re.search(name_pattern, html_content)
+    name = name_match.group(1).strip() if name_match else None
+
+    # Extract email
+    email_pattern = r'mailto:([^"]+)"'
+    email_match = re.search(email_pattern, html_content)
+    email = email_match.group(1) if email_match else None
+
+    # Extract phone - try multiple patterns
+    phone_patterns = [
+        r'<div class="details-bloc">\s*(\d{2}\s+\d{2}\s+\d{2}\s+\d{2}\s+\d{2})\s*</div>',
+        r"(\d{2}\.\d{2}\.\d{2}\.\d{2}\.\d{2})",
+        r"(\d{10})",
+        r"(\+33\s?\d{1}\s?\d{2}\s?\d{2}\s?\d{2}\s?\d{2})",
+    ]
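+    # Patterns above cover, in order: space-separated digit pairs inside the
+    # details block, dot-separated pairs, a bare 10-digit run, and +33 numbers.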
+
+    phone = None
+    for pattern in phone_patterns:
+        phone_match = re.search(pattern, html_content)
+        if phone_match:
+            phone = phone_match.group(1).strip()
+            break
+
+    return name, email, phone
+
+
+def _extract_umpire_with_selectors(page):
+    """
+    Extract umpire data using DOM selectors as fallback
+    """
+    name = None
+    email = None
+    phone = None
+
+    try:
+        # Try different selectors for name
+        name_selectors = [
+            ".tournoi-detail-page-inscription-responsable-title",
+            '[class*="responsable-title"]',
+            '[class*="umpire-name"]',
+            'h3:has-text("Responsable")',
+        ]
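+        # Ordered from the exact class used on the tournament page to
+        # progressively looser class and text matches.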
+
+        for selector in name_selectors:
+            try:
+                element = page.query_selector(selector)
+                if element:
+                    name = element.inner_text().strip()
+                    if name:
+                        break
+            except:
+                continue
+
+        # Try different selectors for email
+        email_selectors = [
+            'a[href^="mailto:"]',
+            '[class*="email"]',
+        ]
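+        # Prefer the mailto: href; fall back to visible text containing "@".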
+
+        for selector in email_selectors:
+            try:
+                element = page.query_selector(selector)
+                if element:
+                    href = element.get_attribute("href")
+                    if href and href.startswith("mailto:"):
+                        email = href.replace("mailto:", "")
+                        break
+                    text = element.inner_text().strip()
+                    if "@" in text:
+                        email = text
+                        break
+            except:
+                continue
+
+        # Try different selectors for phone
+        phone_selectors = [
+            ".details-bloc",
+            '[class*="phone"]',
+            '[class*="telephone"]',
+        ]
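+        # A hit is any element whose text looks like a phone number:
+        # 8+ characters of digits, spaces, dots, +, -, or parentheses.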
+
+        for selector in phone_selectors:
+            try:
+                element = page.query_selector(selector)
+                if element:
+                    text = element.inner_text().strip()
+                    # Check if text looks like a phone number
+                    if re.match(r"[\d\s\.\+\-\(\)]{8,}", text):
+                        phone = text
+                        break
+            except:
+                continue
+
+    except Exception as e:
+        logger.warning(f"Error in selector-based extraction: {e}")
+
+    return name, email, phone
+
+
+def _get_umpire_data_requests_fallback(tournament_id):
+    """
+    Enhanced fallback method using requests
+    """
+    logger.info(f"Using requests fallback for tournament {tournament_id}")
+
+    try:
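+        # Plain HTTP request with browser-like headers; no JavaScript runs here,
+        # so this only succeeds when Queue-It lets the request straight through.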
+        headers = {
+            "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36",
+            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
+            "Accept-Language": "fr-FR,fr;q=0.9,en;q=0.8",
+            "Accept-Encoding": "gzip, deflate, br",
+            "Connection": "keep-alive",
+        }
+
+        url = f"https://tenup.fft.fr/tournoi/{tournament_id}"
+        response = requests.get(url, headers=headers, timeout=15)
+
+        if "queue-it.net" in response.url:
+            # logger.warning("Requests fallback also hit Queue-It")
+            return None, None, None
+
+        if response.status_code != 200:
+            # logger.error(f"Requests fallback failed: {response.status_code}")
+            return None, None, None
+
+        # Try regex extraction on requests response
+        name, email, phone = _extract_umpire_with_regex(response.text)
+
+        # If regex fails, try BeautifulSoup
+        if not name and not email and not phone:
+            try:
+                soup = BeautifulSoup(response.text, "html.parser")
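+                # Look for the same three fields as the regex pass, but
+                # structurally instead of through raw-text patterns.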
+
+                # Try to find name
+                name_elements = soup.find_all(
+                    class_=lambda x: x and "responsable" in x.lower()
+                )
+                if name_elements:
+                    name = name_elements[0].get_text().strip()
+
+                # Try to find email
+                email_links = soup.find_all(
+                    "a", href=lambda x: x and x.startswith("mailto:")
+                )
+                if email_links:
+                    email = email_links[0]["href"].replace("mailto:", "")
+
+                # Try to find phone in various elements
+                for element in soup.find_all(text=True):
+                    if re.search(
+                        r"\d{2}[\s\.]\d{2}[\s\.]\d{2}[\s\.]\d{2}[\s\.]\d{2}",
+                        str(element),
+                    ):
+                        phone = str(element).strip()
+                        break
+
+            except Exception as soup_error:
+                logger.warning(f"BeautifulSoup parsing failed: {soup_error}")
+
+        logger.info(
+            f"Requests fallback result: name={name}, email={email}, phone={phone}"
+        )
+        return name, email, phone
+
+    except Exception as e:
+        logger.error(f"Requests fallback error: {e}")
+        return None, None, None
@@ -715,7 +921,7 @@ def scrape_fft_all_tournaments_concurrent(
     ages=None,
     tournament_types=None,
     national_cup=False,
-    max_workers=5,
+    max_workers=10,
 ):
     """
     Scrapes all remaining pages of FFT tournaments concurrently (pages 1 to end)
@@ -777,8 +983,8 @@ def scrape_fft_all_tournaments_concurrent(
 
     # Scrape all remaining pages concurrently (pages 1 to total_pages-1)
     all_tournaments = []
 
-    with ThreadPoolExecutor(max_workers=max_workers) as executor:
+    max_concurrent = min(total_pages, 20)
+    with ThreadPoolExecutor(max_workers=max_concurrent) as executor:
        futures = []
 
        for page in range(1, total_pages):
@@ -834,7 +1040,7 @@ def _parse_clubs_ajax_response(json_data):
     """
     try:
         # Log the raw response structure to understand what we're getting
-        logger.info(f"Raw clubs response structure: {json_data}")
+        # logger.info(f"Raw clubs response structure: {json_data}")
 
         club_markers = []
         total_results = 0
@@ -937,7 +1143,7 @@ def scrape_federal_clubs(
 
     try:
         with sync_playwright() as p:
-            browser = p.chromium.launch(headless=True)
+            browser = get_browser_for_environment(p)
             page_obj = browser.new_page()
 
             page_obj.set_extra_http_headers(
@@ -949,8 +1155,6 @@ def scrape_federal_clubs(
             # Clean up city name - remove zip code and extra info
             clean_city = city
             if city:
-                import re
-
                 clean_city = re.sub(r"[,\s]*\d{5}.*$", "", city).strip()
                 clean_city = clean_city.rstrip(",").strip()
 
@@ -966,7 +1170,7 @@ def scrape_federal_clubs(
             page_obj.goto(results_url)
 
             # Wait for the page to load
-            page_obj.wait_for_timeout(3000)
+            page_obj.wait_for_timeout(2000)
 
             # Check if we're in queue
             if "queue-it.net" in page_obj.url.lower():
@@ -1056,3 +1260,32 @@ def scrape_federal_clubs(
         logger.error(f"Error in federal clubs scraping: {e}")
         logger.error(f"Traceback: {traceback.format_exc()}")
         return {"typeRecherche": "clubs", "nombreResultat": 0, "club_markers": []}
+
+
+def get_browser_for_environment(playwright_instance):
+    """
+    Get appropriate browser based on environment
+    - Development (DEBUG=True): Use Firefox (works better on macOS 15+)
+    - Production (DEBUG=False): Use Chromium (as it was working)
+    """
+    is_development = getattr(settings, "DEBUG", False)
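+    # settings.DEBUG doubles as the dev/prod switch; if the setting is
+    # missing, fall through to the production (Chromium) branch.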
+
+    if is_development:
+        # logger.info("Development environment detected - using Firefox")
+        try:
+            return playwright_instance.firefox.launch(
+                headless=True, args=["--no-sandbox"]
+            )
+        except Exception as firefox_error:
+            logger.warning(
+                f"Firefox failed in dev, falling back to Chromium: {firefox_error}"
+            )
+            return playwright_instance.chromium.launch(
+                headless=True,
+                args=["--no-sandbox", "--disable-dev-shm-usage", "--single-process"],
+            )
+    else:
+        logger.info("Production environment detected - using Chromium")
+        return playwright_instance.chromium.launch(
+            headless=True, args=["--no-sandbox", "--disable-dev-shm-usage"]
+        )