Refactor Playwright scraping with environment-specific browser and Queue-It handling

Branch: mailing
Razmig Sarkissian, 2 months ago
parent a7cbf4c6a6
commit 2c47025a77

1 changed file with 415 changes: api/utils.py

@@ -6,6 +6,8 @@ from datetime import datetime, timedelta
 import json
 import traceback
 from concurrent.futures import ThreadPoolExecutor, as_completed
+from django.conf import settings
+from bs4 import BeautifulSoup
 logger = logging.getLogger(__name__)
@@ -32,7 +34,7 @@ def scrape_fft_club_tournaments(
     logger.info(f"Starting Playwright scraping for {club_name}")
     try:
         with sync_playwright() as p:
-            browser = p.chromium.launch(headless=True)
+            browser = get_browser_for_environment(p)
             page_obj = browser.new_page()
             page_obj.set_extra_http_headers(
@@ -43,10 +45,10 @@
             # Navigate to FFT
             target_url = "https://tenup.fft.fr/recherche/tournois"
-            logger.info(f"Navigating to: {target_url}")
+            # logger.info(f"Navigating to: {target_url}")
             page_obj.goto(target_url)
-            # page_obj.wait_for_timeout(7000)
+            page_obj.wait_for_timeout(2000)
             current_url = page_obj.url
             logger.info(f"Current URL: {current_url}")
@@ -64,7 +66,7 @@
                 return None
             form_build_id = form_input.get_attribute("value")
-            logger.info(f"Extracted form_build_id: {form_build_id}")
+            # logger.info(f"Extracted form_build_id: {form_build_id}")
             # Build parameters
             date_component = ""
@@ -83,7 +85,7 @@
             params = f"recherche_type=club&club[autocomplete][value_container][value_field]={club_code_clean}&club[autocomplete][value_container][label_field]={club_name_encoded}&pratique=PADEL{date_component}&page={page}&sort=dateDebut+asc&form_build_id={form_build_id}&form_id=recherche_tournois_form&_triggering_element_name=submit_page&_triggering_element_value=Submit+page"
-            logger.info(f"AJAX Parameters: {params}")
+            # logger.info(f"AJAX Parameters: {params}")
             # Make AJAX request and capture the full response
             ajax_script = f"""
@@ -118,24 +120,24 @@
             }}
             """
-            logger.info("Making AJAX request...")
+            # logger.info("Making AJAX request...")
             result = page_obj.evaluate(ajax_script)
             browser.close()
             # Print the full response for debugging
-            logger.info(f"AJAX Response Status: {result.get('status')}")
-            logger.info(f"AJAX Response Success: {result.get('success')}")
+            # logger.info(f"AJAX Response Status: {result.get('status')}")
+            # logger.info(f"AJAX Response Success: {result.get('success')}")
             if result.get("success"):
                 response_text = result.get("responseText", "")
-                logger.info(f"Raw Response Length: {len(response_text)}")
+                # logger.info(f"Raw Response Length: {len(response_text)}")
                 # logger.info(f"Raw Response (first 500 chars): {response_text[:500]}")
                 try:
                     # Try to parse as JSON
                     json_data = json.loads(response_text)
-                    logger.info(f"JSON Response Type: {type(json_data)}")
+                    # logger.info(f"JSON Response Type: {type(json_data)}")
                     # logger.info(f"JSON Response: {json.dumps(json_data, indent=2, default=str)}")
                     # Now try to parse it
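For orientation, the in-page AJAX pattern these hunks keep reusing reduces to the sketch below. The endpoint path and the params value are illustrative — the real URL lives in the elided body of ajax_script — but the shape (POST the Drupal form parameters from inside the page via fetch, hand status and body back to Python) matches the surrounding code:

    from playwright.sync_api import sync_playwright

    params = "recherche_type=club&pratique=PADEL&page=0"  # illustrative subset of the real query string
    ajax_script = f"""
        async () => {{
            try {{
                const response = await fetch("/recherche/tournois", {{
                    method: "POST",
                    headers: {{"Content-Type": "application/x-www-form-urlencoded"}},
                    body: "{params}",
                }});
                const text = await response.text();
                return {{success: response.ok, status: response.status, responseText: text}};
            }} catch (e) {{
                return {{success: false, error: String(e)}};
            }}
        }}
    """

    with sync_playwright() as p:
        browser = p.firefox.launch(headless=True)
        page_obj = browser.new_page()
        page_obj.goto("https://tenup.fft.fr/recherche/tournois")
        result = page_obj.evaluate(ajax_script)  # dict with success/status/responseText
        browser.close()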
@@ -164,7 +166,7 @@ def scrape_fft_club_tournaments_all_pages(
     """
     Scrapes all pages of FFT tournaments for a specific club
     """
-    logger.info(f"Starting complete tournament scraping for {club_name}")
+    # logger.info(f"Starting complete tournament scraping for {club_name}")
     all_tournaments = []
     page = 0
@@ -237,7 +239,7 @@ def _parse_ajax_response(commands):
     # Check for alert commands (maintenance mode)
     for command in commands:
         if command.get("command") == "alert":
-            logger.warning("Maintenance mode detected")
+            # logger.warning("Maintenance mode detected")
             return None
     # Find the command with results
@@ -245,7 +247,7 @@
     for command in commands:
         if command.get("command") == "recherche_tournois_update":
             result_command = command
-            logger.info("Found recherche_tournois_update command!")
+            # logger.info("Found recherche_tournois_update command!")
             break
     if result_command and result_command.get("results"):
@@ -253,7 +255,7 @@
         items = results.get("items", [])
         total_results = results.get("nb_results", 0)
-        logger.info(f"Processing {len(items)} tournaments from results")
+        # logger.info(f"Processing {len(items)} tournaments from results")
         for item in items:
             # Parse dates - they're already in the correct format
@@ -331,9 +333,9 @@
             tournaments.append(tournament)
-        logger.info(
-            f"Successfully parsed {len(tournaments)} tournaments from response"
-        )
+        # logger.info(
+        #     f"Successfully parsed {len(tournaments)} tournaments from response"
+        # )
         return {
             "tournaments": tournaments,
             "total_results": total_results,
@@ -371,7 +373,7 @@ def scrape_fft_all_tournaments(
     try:
         with sync_playwright() as p:
-            browser = p.chromium.launch(headless=True)
+            browser = get_browser_for_environment(p)
             page_obj = browser.new_page()
             page_obj.set_extra_http_headers(
@@ -382,28 +384,28 @@
             # Navigate to FFT
             target_url = "https://tenup.fft.fr/recherche/tournois"
-            logger.info(f"Navigating to: {target_url}")
+            # logger.info(f"Navigating to: {target_url}")
             page_obj.goto(target_url)
-            # page_obj.wait_for_timeout(7000)
+            page_obj.wait_for_timeout(500)
             current_url = page_obj.url
-            logger.info(f"Current URL: {current_url}")
+            # logger.info(f"Current URL: {current_url}")
             if "queue-it.net" in current_url.lower():
-                logger.warning("Still in Queue-It")
+                # logger.warning("Still in Queue-It")
                 browser.close()
                 return None
             # Extract form_build_id
             form_input = page_obj.query_selector('input[name="form_build_id"]')
             if not form_input:
-                logger.error("Could not find form_build_id")
+                # logger.error("Could not find form_build_id")
                 browser.close()
                 return None
             form_build_id = form_input.get_attribute("value")
-            logger.info(f"Extracted form_build_id: {form_build_id}")
+            # logger.info(f"Extracted form_build_id: {form_build_id}")
             # Build parameters - EXACT same pattern as club function
             date_component = ""
@@ -422,32 +424,32 @@
             # Add categories filter
             if categories:
-                logger.info(f"Adding categories filter: {categories}")
+                # logger.info(f"Adding categories filter: {categories}")
                 for category in categories:
                     filter_params += f"&epreuve[{category}]={category}"
             # Add levels filter
             if levels:
-                logger.info(f"Adding levels filter: {levels}")
+                # logger.info(f"Adding levels filter: {levels}")
                 for level in levels:
                     filter_params += f"&categorie_tournoi[{level}]={level}"
             # Add ages filter
             if ages:
-                logger.info(f"Adding ages filter: {ages}")
+                # logger.info(f"Adding ages filter: {ages}")
                 for age in ages:
                     filter_params += f"&categorie_age[{age}]={age}"
             # Add types filter
             if tournament_types:
-                logger.info(f"Adding types filter: {tournament_types}")
+                # logger.info(f"Adding types filter: {tournament_types}")
                 for t_type in tournament_types:
                     capitalized_type = t_type.capitalize()
                     filter_params += f"&type[{capitalized_type}]={capitalized_type}"
             # Add national cup filter
             if national_cup:
-                logger.info("Adding national cup filter")
+                # logger.info("Adding national cup filter")
                 filter_params += "&tournoi_npc=1"
             # Fix the sorting parameter
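For concreteness: with categories=["DM"], levels=["P100"], and national_cup=True (values illustrative, not taken from this commit), the loops above would extend filter_params with:

    &epreuve[DM]=DM&categorie_tournoi[P100]=P100&tournoi_npc=1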
@@ -477,7 +479,7 @@
             # Default to ligue search if no city provided
             params = f"recherche_type=ligue&pratique=PADEL{date_component}{filter_params}&page={page}&sort={sorting_option or 'dateDebut+asc'}&form_build_id={form_build_id}&form_id=recherche_tournois_form&_triggering_element_name=submit_page&_triggering_element_value=Submit+page"
-            logger.info(f"AJAX Parameters: {params}")
+            # logger.info(f"AJAX Parameters: {params}")
             # Make AJAX request and capture the full response - EXACT same as club function
             ajax_script = f"""
@@ -512,23 +514,23 @@
             }}
             """
-            logger.info("Making AJAX request...")
+            # logger.info("Making AJAX request...")
             result = page_obj.evaluate(ajax_script)
             browser.close()
             # Print the full response for debugging - EXACT same as club function
-            logger.info(f"AJAX Response Status: {result.get('status')}")
-            logger.info(f"AJAX Response Success: {result.get('success')}")
+            # logger.info(f"AJAX Response Status: {result.get('status')}")
+            # logger.info(f"AJAX Response Success: {result.get('success')}")
             if result.get("success"):
                 response_text = result.get("responseText", "")
-                logger.info(f"Raw Response Length: {len(response_text)}")
+                # logger.info(f"Raw Response Length: {len(response_text)}")
                 try:
                     # Try to parse as JSON
                     json_data = json.loads(response_text)
-                    logger.info(f"JSON Response Type: {type(json_data)}")
+                    # logger.info(f"JSON Response Type: {type(json_data)}")
                     # Now try to parse it - EXACT same as club function
                     parsed_result = _parse_ajax_response(json_data)
@@ -536,11 +538,11 @@
                     return parsed_result
                 except json.JSONDecodeError as json_error:
-                    logger.error(f"JSON Parse Error: {json_error}")
-                    logger.error(f"Response text: {response_text}")
+                    # logger.error(f"JSON Parse Error: {json_error}")
+                    # logger.error(f"Response text: {response_text}")
                     return None
             else:
-                logger.error(f"AJAX request failed: {result.get('error')}")
+                # logger.error(f"AJAX request failed: {result.get('error')}")
                 return None
     except Exception as e:
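A side note on the pattern running through these hunks: chatty logger.info calls are silenced by commenting them out one by one. The same effect is conventionally achieved once, at logging-config time — a minimal sketch, assuming this module is importable as api.utils so its logger carries that name:

    import logging

    # Drop INFO/DEBUG records from api.utils while keeping ERROR visible,
    # without touching any call sites. (Logger name assumed from the file path.)
    logging.getLogger("api.utils").setLevel(logging.ERROR)

That keeps the call sites intact and lets the verbosity come back with a one-line change.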
@@ -551,73 +553,277 @@ def scrape_fft_all_tournaments(
 def get_umpire_data(tournament_id):
     """
-    Fast umpire data extraction using Playwright (optimized for speed)
+    Umpire data extraction with improved Queue-It handling
     """
-    logger.info(f"Getting umpire data for tournament {tournament_id}")
+    # logger.info(f"Getting umpire data for tournament {tournament_id}")
     try:
         with sync_playwright() as p:
-            browser = p.chromium.launch(
-                headless=True,
-                args=[
-                    "--no-sandbox",
-                    "--disable-dev-shm-usage",
-                    "--disable-images",  # Don't load images
-                    "--disable-javascript",  # Disable JS for faster loading
-                    "--disable-plugins",
-                    "--disable-extensions",
-                ],
-            )
+            browser = get_browser_for_environment(p)
             page = browser.new_page()
-            # Navigate to tournament page quickly
+            # Navigate to tournament page
             url = f"https://tenup.fft.fr/tournoi/{tournament_id}"
-            logger.info(f"Navigating to tournament page: {url}")
+            # logger.info(f"Navigating to tournament page: {url}")
             try:
-                # Fast navigation - don't wait for everything to load
-                page.goto(url, timeout=15000, wait_until="domcontentloaded")
-                # Quick Queue-It check
+                # Navigate with reasonable timeout
+                page.goto(url, timeout=30000, wait_until="domcontentloaded")
+                # Enhanced Queue-It handling (similar to tournament search)
                 if "queue-it.net" in page.url.lower():
-                    logger.warning("Hit Queue-It on tournament page")
-                    browser.close()
-                    return None, None, None
-                # Extract data using the fastest method - regex on HTML content
-                html_content = page.content()
-                # Extract name
-                name_pattern = (
-                    r'tournoi-detail-page-inscription-responsable-title">\s*([^<]+)\s*<'
-                )
-                name_match = re.search(name_pattern, html_content)
-                name = name_match.group(1).strip() if name_match else None
-                # Extract email
-                email_pattern = r'mailto:([^"]+)"'
-                email_match = re.search(email_pattern, html_content)
-                email = email_match.group(1) if email_match else None
-                # Extract phone
-                phone_pattern = r'<div class="details-bloc">\s*(\d{2}\s+\d{2}\s+\d{2}\s+\d{2}\s+\d{2})\s*</div>'
-                phone_match = re.search(phone_pattern, html_content)
-                phone = phone_match.group(1).strip() if phone_match else None
-                browser.close()
-                logger.info(
-                    f"Extracted umpire data: name={name}, email={email}, phone={phone}"
-                )
-                return name, email, phone
+                    # logger.warning(f"Queue-It detected for tournament {tournament_id}")
+                    # Wait strategy based on environment
+                    max_queue_wait = (
+                        120000 if not settings.DEBUG else 30000
+                    )  # 2 min prod, 30s dev
+                    check_interval = 1000  # 1 second between checks
+                    elapsed_time = 0
+                    # logger.info(
+                    #     f"Waiting in queue for umpire data (max {max_queue_wait / 1000}s)..."
+                    # )
+                    while (
+                        elapsed_time < max_queue_wait
+                        and "queue-it.net" in page.url.lower()
+                    ):
+                        # logger.info(
+                        #     f"Umpire queue wait: {elapsed_time / 1000}s elapsed"
+                        # )
+                        page.wait_for_timeout(check_interval)
+                        elapsed_time += check_interval
+                        # Check if we've been redirected
+                        if "queue-it.net" not in page.url.lower():
+                            # logger.info(
+                            #     "Successfully passed through queue for umpire data!"
+                            # )
+                            break
+                    # If still in queue, try fallback or return None
+                    if "queue-it.net" in page.url.lower():
+                        # logger.warning(
+                        #     "Umpire queue timeout - trying requests fallback"
+                        # )
+                        browser.close()
+                        return _get_umpire_data_requests_fallback(tournament_id)
+                # Wait for page to load properly
+                page.wait_for_load_state("networkidle", timeout=20000)
+                # Extract data using multiple strategies
+                html_content = page.content()
+                # Strategy 1: Try existing regex patterns
+                name, email, phone = _extract_umpire_with_regex(html_content)
+                # Strategy 2: If regex fails, try DOM selectors
+                if not name and not email and not phone:
+                    name, email, phone = _extract_umpire_with_selectors(page)
+                browser.close()
+                if name or email or phone:
+                    # logger.info(
+                    #     f"Successfully extracted umpire data: name={name}, email={email}, phone={phone}"
+                    # )
+                    return name, email, phone
+                else:
+                    logger.warning(
+                        f"No umpire data found for tournament {tournament_id}"
+                    )
+                    return None, None, None
             except Exception as page_error:
-                logger.error(f"Error loading tournament page: {page_error}")
+                logger.error(
+                    f"Error loading tournament page {tournament_id}: {page_error}"
+                )
                 browser.close()
-                return None, None, None
+                # Try requests fallback
+                logger.info("Trying requests fallback after Playwright error")
+                return _get_umpire_data_requests_fallback(tournament_id)
     except Exception as e:
-        logger.error(f"Error in umpire data extraction: {e}")
+        logger.error(f"Error in umpire data extraction for {tournament_id}: {e}")
         return None, None, None
+def _extract_umpire_with_regex(html_content):
+    """
+    Extract umpire data using regex patterns
+    """
+    # Extract name
+    name_pattern = r'tournoi-detail-page-inscription-responsable-title">\s*([^<]+)\s*<'
+    name_match = re.search(name_pattern, html_content)
+    name = name_match.group(1).strip() if name_match else None
+    # Extract email
+    email_pattern = r'mailto:([^"]+)"'
+    email_match = re.search(email_pattern, html_content)
+    email = email_match.group(1) if email_match else None
+    # Extract phone - try multiple patterns
+    phone_patterns = [
+        r'<div class="details-bloc">\s*(\d{2}\s+\d{2}\s+\d{2}\s+\d{2}\s+\d{2})\s*</div>',
+        r"(\d{2}\.\d{2}\.\d{2}\.\d{2}\.\d{2})",
+        r"(\d{10})",
+        r"(\+33\s?\d{1}\s?\d{2}\s?\d{2}\s?\d{2}\s?\d{2})",
+    ]
+    phone = None
+    for pattern in phone_patterns:
+        phone_match = re.search(pattern, html_content)
+        if phone_match:
+            phone = phone_match.group(1).strip()
+            break
+    return name, email, phone
+def _extract_umpire_with_selectors(page):
+    """
+    Extract umpire data using DOM selectors as fallback
+    """
+    name = None
+    email = None
+    phone = None
+    try:
+        # Try different selectors for name
+        name_selectors = [
+            ".tournoi-detail-page-inscription-responsable-title",
+            '[class*="responsable-title"]',
+            '[class*="umpire-name"]',
+            'h3:has-text("Responsable")',
+        ]
+        for selector in name_selectors:
+            try:
+                element = page.query_selector(selector)
+                if element:
+                    name = element.inner_text().strip()
+                    if name:
+                        break
+            except Exception:
+                continue
+        # Try different selectors for email
+        email_selectors = [
+            'a[href^="mailto:"]',
+            '[class*="email"]',
+        ]
+        for selector in email_selectors:
+            try:
+                element = page.query_selector(selector)
+                if element:
+                    href = element.get_attribute("href")
+                    if href and href.startswith("mailto:"):
+                        email = href.replace("mailto:", "")
+                        break
+                    text = element.inner_text().strip()
+                    if "@" in text:
+                        email = text
+                        break
+            except Exception:
+                continue
+        # Try different selectors for phone
+        phone_selectors = [
+            ".details-bloc",
+            '[class*="phone"]',
+            '[class*="telephone"]',
+        ]
+        for selector in phone_selectors:
+            try:
+                element = page.query_selector(selector)
+                if element:
+                    text = element.inner_text().strip()
+                    # Check if text looks like a phone number
+                    if re.match(r"[\d\s\.\+\-\(\)]{8,}", text):
+                        phone = text
+                        break
+            except Exception:
+                continue
+    except Exception as e:
+        logger.warning(f"Error in selector-based extraction: {e}")
+    return name, email, phone
+def _get_umpire_data_requests_fallback(tournament_id):
+    """
+    Enhanced fallback method using requests
+    """
+    logger.info(f"Using requests fallback for tournament {tournament_id}")
+    try:
+        headers = {
+            "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36",
+            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
+            "Accept-Language": "fr-FR,fr;q=0.9,en;q=0.8",
+            "Accept-Encoding": "gzip, deflate, br",
+            "Connection": "keep-alive",
+        }
+        url = f"https://tenup.fft.fr/tournoi/{tournament_id}"
+        response = requests.get(url, headers=headers, timeout=15)
+        if "queue-it.net" in response.url:
+            # logger.warning("Requests fallback also hit Queue-It")
+            return None, None, None
+        if response.status_code != 200:
+            # logger.error(f"Requests fallback failed: {response.status_code}")
+            return None, None, None
+        # Try regex extraction on requests response
+        name, email, phone = _extract_umpire_with_regex(response.text)
+        # If regex fails, try BeautifulSoup
+        if not name and not email and not phone:
+            try:
+                soup = BeautifulSoup(response.text, "html.parser")
+                # Try to find name
+                name_elements = soup.find_all(
+                    class_=lambda x: x and "responsable" in x.lower()
+                )
+                if name_elements:
+                    name = name_elements[0].get_text().strip()
+                # Try to find email
+                email_links = soup.find_all(
+                    "a", href=lambda x: x and x.startswith("mailto:")
+                )
+                if email_links:
+                    email = email_links[0]["href"].replace("mailto:", "")
+                # Try to find phone in various elements
+                for element in soup.find_all(text=True):
+                    if re.search(
+                        r"\d{2}[\s\.]\d{2}[\s\.]\d{2}[\s\.]\d{2}[\s\.]\d{2}",
+                        str(element),
+                    ):
+                        phone = str(element).strip()
+                        break
+            except Exception as soup_error:
+                logger.warning(f"BeautifulSoup parsing failed: {soup_error}")
+        logger.info(
+            f"Requests fallback result: name={name}, email={email}, phone={phone}"
+        )
+        return name, email, phone
+    except Exception as e:
+        logger.error(f"Requests fallback error: {e}")
+        return None, None, None
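A quick way to exercise _extract_umpire_with_regex in isolation — a minimal sketch with made-up HTML that mirrors the markup the patterns target (the tag structure here is inferred from the regexes, not taken from tenup.fft.fr):

    sample_html = """
    <div class="tournoi-detail-page-inscription-responsable-title"> Jane Doe </div>
    <a href="mailto:jane.doe@example.com">Contact</a>
    <div class="details-bloc"> 06 12 34 56 78 </div>
    """
    name, email, phone = _extract_umpire_with_regex(sample_html)
    assert name == "Jane Doe"
    assert email == "jane.doe@example.com"
    assert phone == "06 12 34 56 78"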
@@ -715,7 +921,7 @@ def scrape_fft_all_tournaments_concurrent(
     ages=None,
     tournament_types=None,
     national_cup=False,
-    max_workers=5,
+    max_workers=10,
 ):
     """
     Scrapes all remaining pages of FFT tournaments concurrently (pages 1 to end)
@@ -777,8 +983,8 @@ def scrape_fft_all_tournaments_concurrent(
     # Scrape all remaining pages concurrently (pages 1 to total_pages-1)
     all_tournaments = []
-
-    with ThreadPoolExecutor(max_workers=max_workers) as executor:
+    max_concurrent = min(total_pages, 20)
+    with ThreadPoolExecutor(max_workers=max_concurrent) as executor:
         futures = []
         for page in range(1, total_pages):
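One observation on this hunk: after the change, the max_workers parameter (raised to 10 in the signature above) no longer bounds the pool — min(total_pages, 20) does. If both limits are meant to apply, a sketch of a combined bound (assuming, as in the loop above, that pages 1 .. total_pages - 1 are submitted):

    # Respect the caller's max_workers and never exceed the job count;
    # ThreadPoolExecutor requires max_workers >= 1.
    max_concurrent = max(1, min(total_pages - 1, max_workers))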
@@ -834,7 +1040,7 @@ def _parse_clubs_ajax_response(json_data):
     """
     try:
         # Log the raw response structure to understand what we're getting
-        logger.info(f"Raw clubs response structure: {json_data}")
+        # logger.info(f"Raw clubs response structure: {json_data}")
         club_markers = []
         total_results = 0
@@ -937,7 +1143,7 @@ def scrape_federal_clubs(
     try:
         with sync_playwright() as p:
-            browser = p.chromium.launch(headless=True)
+            browser = get_browser_for_environment(p)
             page_obj = browser.new_page()
             page_obj.set_extra_http_headers(
@@ -949,8 +1155,6 @@ def scrape_federal_clubs(
             # Clean up city name - remove zip code and extra info
             clean_city = city
             if city:
-                import re
-
                 clean_city = re.sub(r"[,\s]*\d{5}.*$", "", city).strip()
                 clean_city = clean_city.rstrip(",").strip()
@@ -966,7 +1170,7 @@ def scrape_federal_clubs(
             page_obj.goto(results_url)
             # Wait for the page to load
-            page_obj.wait_for_timeout(3000)
+            page_obj.wait_for_timeout(2000)
             # Check if we're in queue
             if "queue-it.net" in page_obj.url.lower():
@@ -1056,3 +1260,32 @@ def scrape_federal_clubs(
         logger.error(f"Error in federal clubs scraping: {e}")
         logger.error(f"Traceback: {traceback.format_exc()}")
         return {"typeRecherche": "clubs", "nombreResultat": 0, "club_markers": []}
+
+
+def get_browser_for_environment(playwright_instance):
+    """
+    Get appropriate browser based on environment
+    - Development (DEBUG=True): Use Firefox (works better on macOS 15+)
+    - Production (DEBUG=False): Use Chromium (as it was working)
+    """
+    is_development = getattr(settings, "DEBUG", False)
+    if is_development:
+        # logger.info("Development environment detected - using Firefox")
+        try:
+            return playwright_instance.firefox.launch(
+                headless=True, args=["--no-sandbox"]
+            )
+        except Exception as firefox_error:
+            logger.warning(
+                f"Firefox failed in dev, falling back to Chromium: {firefox_error}"
+            )
+            return playwright_instance.chromium.launch(
+                headless=True,
+                args=["--no-sandbox", "--disable-dev-shm-usage", "--single-process"],
+            )
+    else:
+        logger.info("Production environment detected - using Chromium")
+        return playwright_instance.chromium.launch(
+            headless=True, args=["--no-sandbox", "--disable-dev-shm-usage"]
+        )
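With that helper in place, every Playwright entry point in this commit reduces to the same shape; a minimal sketch of the pattern (URL from the diff, error handling elided):

    from playwright.sync_api import sync_playwright

    with sync_playwright() as p:
        browser = get_browser_for_environment(p)  # Firefox in dev, Chromium in prod
        page = browser.new_page()
        page.goto("https://tenup.fft.fr/recherche/tournois")
        # ... scrape ...
        browser.close()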
