Improve scraper to download multiple result pages

master
Laurent Morvillier 4 years ago
parent a57c0f13ec
commit 3a8ef77ef6
  1. 36
      TournamentStats.xcodeproj/xcuserdata/laurent.xcuserdatad/xcdebugger/Breakpoints_v2.xcbkptlist
  2. 173
      TournamentStats/scrapper/Downloader.swift

@ -40,9 +40,9 @@
filePath = "TournamentStats/scrapper/Downloader.swift" filePath = "TournamentStats/scrapper/Downloader.swift"
startingColumnNumber = "9223372036854775807" startingColumnNumber = "9223372036854775807"
endingColumnNumber = "9223372036854775807" endingColumnNumber = "9223372036854775807"
startingLineNumber = "188" startingLineNumber = "203"
endingLineNumber = "188" endingLineNumber = "203"
landmarkName = "downloadWSOPResults(tournamentInfo:)" landmarkName = "downloadWSOPResults(url:date:currentTournament:)"
landmarkType = "7"> landmarkType = "7">
</BreakpointContent> </BreakpointContent>
</BreakpointProxy> </BreakpointProxy>
@ -56,9 +56,9 @@
filePath = "TournamentStats/scrapper/Downloader.swift" filePath = "TournamentStats/scrapper/Downloader.swift"
startingColumnNumber = "9223372036854775807" startingColumnNumber = "9223372036854775807"
endingColumnNumber = "9223372036854775807" endingColumnNumber = "9223372036854775807"
startingLineNumber = "161" startingLineNumber = "177"
endingLineNumber = "161" endingLineNumber = "177"
landmarkName = "downloadWSOPResults(tournamentInfo:)" landmarkName = "downloadWSOPResults(url:date:currentTournament:)"
landmarkType = "7"> landmarkType = "7">
</BreakpointContent> </BreakpointContent>
</BreakpointProxy> </BreakpointProxy>
@ -72,9 +72,9 @@
filePath = "TournamentStats/scrapper/Downloader.swift" filePath = "TournamentStats/scrapper/Downloader.swift"
startingColumnNumber = "9223372036854775807" startingColumnNumber = "9223372036854775807"
endingColumnNumber = "9223372036854775807" endingColumnNumber = "9223372036854775807"
startingLineNumber = "137" startingLineNumber = "153"
endingLineNumber = "137" endingLineNumber = "153"
landmarkName = "downloadWSOPResults(tournamentInfo:)" landmarkName = "downloadWSOPResults(url:date:currentTournament:)"
landmarkType = "7"> landmarkType = "7">
</BreakpointContent> </BreakpointContent>
</BreakpointProxy> </BreakpointProxy>
@ -88,9 +88,9 @@
filePath = "TournamentStats/scrapper/Downloader.swift" filePath = "TournamentStats/scrapper/Downloader.swift"
startingColumnNumber = "9223372036854775807" startingColumnNumber = "9223372036854775807"
endingColumnNumber = "9223372036854775807" endingColumnNumber = "9223372036854775807"
startingLineNumber = "130" startingLineNumber = "146"
endingLineNumber = "130" endingLineNumber = "146"
landmarkName = "downloadWSOPResults(tournamentInfo:)" landmarkName = "downloadWSOPResults(url:date:currentTournament:)"
landmarkType = "7"> landmarkType = "7">
</BreakpointContent> </BreakpointContent>
</BreakpointProxy> </BreakpointProxy>
@ -104,9 +104,9 @@
filePath = "TournamentStats/scrapper/Downloader.swift" filePath = "TournamentStats/scrapper/Downloader.swift"
startingColumnNumber = "9223372036854775807" startingColumnNumber = "9223372036854775807"
endingColumnNumber = "9223372036854775807" endingColumnNumber = "9223372036854775807"
startingLineNumber = "138" startingLineNumber = "154"
endingLineNumber = "138" endingLineNumber = "154"
landmarkName = "downloadWSOPResults(tournamentInfo:)" landmarkName = "downloadWSOPResults(url:date:currentTournament:)"
landmarkType = "7"> landmarkType = "7">
</BreakpointContent> </BreakpointContent>
</BreakpointProxy> </BreakpointProxy>
@ -120,9 +120,9 @@
filePath = "TournamentStats/scrapper/Downloader.swift" filePath = "TournamentStats/scrapper/Downloader.swift"
startingColumnNumber = "9223372036854775807" startingColumnNumber = "9223372036854775807"
endingColumnNumber = "9223372036854775807" endingColumnNumber = "9223372036854775807"
startingLineNumber = "153" startingLineNumber = "169"
endingLineNumber = "153" endingLineNumber = "169"
landmarkName = "downloadWSOPResults(tournamentInfo:)" landmarkName = "downloadWSOPResults(url:date:currentTournament:)"
landmarkType = "7"> landmarkType = "7">
</BreakpointContent> </BreakpointContent>
</BreakpointProxy> </BreakpointProxy>

@ -18,13 +18,13 @@ class Downloader {
static func downloadWSOPResults() { static func downloadWSOPResults() {
let infos = [ let infos: [TS] = [
// TS(id: "19728", date: "02/10/2021"), // TS(id: "19728", date: "02/10/2021"),
// TS(id: "19729", date: "03/10/2021"), // TS(id: "19729", date: "03/10/2021"),
// TS(id: "19730", date: "02/10/2021"), // TS(id: "19730", date: "02/10/2021"),
// TS(id: "19732", date: "04/10/2021"), // TS(id: "19732", date: "04/10/2021"),
// TS(id: "19733", date: "05/10/2021"), // TS(id: "19733", date: "05/10/2021"),
// TS(id: "19731", date: "06/10/2021"), TS(id: "19731", date: "06/10/2021"),
// TS(id: "19734", date: "06/10/2021"), // TS(id: "19734", date: "06/10/2021"),
// TS(id: "19926", date: "06/10/2021"), // TS(id: "19926", date: "06/10/2021"),
// TS(id: "19927", date: "06/10/2021"), // TS(id: "19927", date: "06/10/2021"),
@ -40,7 +40,7 @@ class Downloader {
// TS(id: "19745", date: "12/10/2021"), // TS(id: "19745", date: "12/10/2021"),
// TS(id: "19747", date: "12/10/2021"), // TS(id: "19747", date: "12/10/2021"),
// TS(id: "19748", date: "12/10/2021"), // TS(id: "19748", date: "12/10/2021"),
TS(id: "19749", date: "13/10/2021"), // TS(id: "19749", date: "13/10/2021"),
] ]
for info in infos { for info in infos {
@ -65,6 +65,18 @@ class Downloader {
guard let url = URL(string: "https://www.wsop.com/tournaments/results/?aid=2&grid=4865&tid=\(tournamentInfo.id)&rr=5") else { guard let url = URL(string: "https://www.wsop.com/tournaments/results/?aid=2&grid=4865&tid=\(tournamentInfo.id)&rr=5") else {
return return
} }
let realm = try! Realm()
do {
try realm.write {
self.downloadWSOPResults(url: url, date: tournamentInfo.date)
}
} catch {
print("error = \(error)")
}
}
fileprivate static func downloadWSOPResults(url: URL, date: String, currentTournament: Tournament? = nil) {
let realm = try! Realm() let realm = try! Realm()
@ -82,24 +94,26 @@ class Downloader {
if let infos = infos { if let infos = infos {
try realm.write { var tournament: Tournament? = currentTournament
let tournament = realm.create(Tournament.self) if tournament == nil {
let tour = realm.create(Tournament.self)
tournament = tour
if let name = infos.slice(from: "<h1>", to: "</h1>") { if let name = infos.slice(from: "<h1>", to: "</h1>") {
tournament.name = name tour.name = name
tournament.date = Formatter.basicDate.date(from: tournamentInfo.date) ?? Date() tour.date = Formatter.basicDate.date(from: date) ?? Date()
if let strNumber = name.slice(from: "Event #", to: ":"), let number = nf.number(from: strNumber) { if let strNumber = name.slice(from: "Event #", to: ":"), let number = nf.number(from: strNumber) {
tournament.number = number.intValue tour.number = number.intValue
} }
if name.contains("6-Handed") { if name.contains("6-Handed") {
tournament.tableSize = 6 tour.tableSize = 6
} else if name.contains("8-Handed") { } else if name.contains("8-Handed") {
tournament.tableSize = 8 tour.tableSize = 8
} else if name.contains("Heads-Up") { } else if name.contains("Heads-Up") {
tournament.tableSize = 2 tour.tableSize = 2
} }
} else { } else {
@ -109,101 +123,116 @@ class Downloader {
if let stats = string.slice(from: "<ul id=\"eventstats\">", to: "<div id=\"nav-tabs\">") { if let stats = string.slice(from: "<ul id=\"eventstats\">", to: "<div id=\"nav-tabs\">") {
if let strBuyin = stats.slice(from: "Buy-in:</b> &#36;", to: "</li>"), let buyin = nf.number(from: strBuyin) { if let strBuyin = stats.slice(from: "Buy-in:</b> &#36;", to: "</li>"), let buyin = nf.number(from: strBuyin) {
tournament.buyin = buyin.intValue tour.buyin = buyin.intValue
} else { } else {
print("no buyin = \(stats)") print("no buyin = \(stats)")
} }
if let strprizepool = stats.slice(from: "Prizepool:</b> &#36;", to: " </li>"), let pp = nf.number(from: strprizepool) { if let strprizepool = stats.slice(from: "Prizepool:</b> &#36;", to: " </li>"), let pp = nf.number(from: strprizepool) {
tournament.prizepool = pp.doubleValue tour.prizepool = pp.doubleValue
} else { } else {
print("no prizepool = \(stats)") print("no prizepool = \(stats)")
} }
if let strentries = stats.slice(from: "Entries:</b> ", to: " </li>"), let entries = nf.number(from: strentries) { if let strentries = stats.slice(from: "Entries:</b> ", to: " </li>"), let entries = nf.number(from: strentries) {
tournament.entries = entries.intValue tour.entries = entries.intValue
} else { } else {
print("no entries = \(stats)") print("no entries = \(stats)")
} }
} }
if let strresultsblock = string.slice(from: "Country</li>", to: "<div id=\"PagingNav\"") { }
let strresults = strresultsblock.components(separatedBy: "<li class='place")
print("Results count = \(strresults.count)") if let strresultsblock = string.slice(from: "Country</li>", to: "<div id=\"PagingNav\"") {
let strresults = strresultsblock.components(separatedBy: "<li class='place")
for strresult in strresults { print("Results count = \(strresults.count)")
let cells = strresult.components(separatedBy: "<li") for strresult in strresults {
if !(cells.count == 6 || cells.count == 7) {
print("cell count = \(cells.count)")
continue
}
let hasPOY = cells.count == 7 let cells = strresult.components(separatedBy: "<li")
if !(cells.count == 6 || cells.count == 7) {
print("cell count = \(cells.count)")
continue
}
var rank: Int? = nil let hasPOY = cells.count == 7
var name: String? = nil
var earnings: Double? = nil
var city: String? = nil
var state: String? = nil
var country: String? = nil
var poy: Double? = nil
if let strrank = cells[0].slice(from: ">", to: "<") { var rank: Int? = nil
rank = nf.number(from: strrank)?.intValue var name: String? = nil
} var earnings: Double? = nil
var city: String? = nil
var state: String? = nil
var country: String? = nil
var poy: Double? = nil
if let nameraw = cells[1].components(separatedBy: "'>").last { if let strrank = cells[0].slice(from: ">", to: "<") {
name = nameraw.replacingOccurrences(of: "</a></li>", with: "") rank = nf.number(from: strrank)?.intValue
} }
if let earningsraw = cells[2].components(separatedBy: "&#36;").last { if let nameraw = cells[1].components(separatedBy: "'>").last {
let strearnings = earningsraw.replacingOccurrences(of: "</li>", with: "") name = nameraw.replacingOccurrences(of: "</a></li>", with: "")
earnings = nf.number(from: strearnings)?.doubleValue }
}
if hasPOY, let poyraw = cells[3].components(separatedBy: "\'>").last { if let earningsraw = cells[2].components(separatedBy: "&#36;").last {
let strpoy = poyraw.replacingOccurrences(of: "</li>", with: "") let strearnings = earningsraw.replacingOccurrences(of: "</li>", with: "")
poy = nf.number(from: strpoy)?.doubleValue earnings = nf.number(from: strearnings)?.doubleValue
} }
let cityIndex = hasPOY ? 4 : 3 if hasPOY, let poyraw = cells[3].components(separatedBy: "\'>").last {
if let cityraw = cells[cityIndex].components(separatedBy: "\'>").last { let strpoy = poyraw.replacingOccurrences(of: "</li>", with: "")
city = cityraw.replacingOccurrences(of: "</li>", with: "") poy = nf.number(from: strpoy)?.doubleValue
} }
let stateIndex = hasPOY ? 5 : 4 let cityIndex = hasPOY ? 4 : 3
if let stateraw if let cityraw = cells[cityIndex].components(separatedBy: "\'>").last {
= cells[stateIndex].components(separatedBy: "\'>").last { city = cityraw.replacingOccurrences(of: "</li>", with: "")
state = stateraw.replacingOccurrences(of: "</li>", with: "") }
}
let countryIndex = hasPOY ? 6 : 5
if let countryraw
= cells[countryIndex].components(separatedBy: "\'>").last {
country = String(countryraw.prefix(2))
// country = countrytmp.prefix(2)
}
guard let rank = rank, let name = name, let earnings = earnings else { let stateIndex = hasPOY ? 5 : 4
print("Missing data: R: \(rank), N: \(name), E: \(earnings)") if let stateraw
continue = cells[stateIndex].components(separatedBy: "\'>").last {
} state = stateraw.replacingOccurrences(of: "</li>", with: "")
}
let countryIndex = hasPOY ? 6 : 5
if let countryraw
= cells[countryIndex].components(separatedBy: "\'>").last {
country = String(countryraw.prefix(2))
}
let player = realm.getOrCreatePlayer(name: name, city: city ?? "", state: state ?? "", country: country ?? "") guard let rank = rank, let name = name, let earnings = earnings else {
print("Missing data: R: \(rank), N: \(name), E: \(earnings)")
continue
}
let player = realm.getOrCreatePlayer(name: name, city: city ?? "", state: state ?? "", country: country ?? "")
let result = realm.create(Result.self)
result.player = player
result.earnings = earnings
result.rank = rank
tournament?.results.append(result)
let result = realm.create(Result.self) }
result.player = player
result.earnings = earnings
result.rank = rank
tournament.results.append(result) if currentTournament == nil, let paging = string.slice(from: "<div id=\"PagingNav\"", to: "<div class=\"banner\">") {
let pages = paging.components(separatedBy: "<a href='")
if pages.count > 1 {
for i in 2..<pages.count { // don't do page 1 again
let components = pages[i].components(separatedBy: "'")
if let urlString = components.first,
let url = URL(string: "https://www.wsop.com\(urlString)") {
self.downloadWSOPResults(url: url, date: date, currentTournament: tournament)
}
}
} }
} else {
print("Bad page strresultsblock")
} }
} else {
print("Bad page strresultsblock")
} }
} else { } else {
print("Missing name block") print("Missing name block")

Loading…
Cancel
Save