Improve scraper to download multiple result pages

master
Laurent Morvillier 4 years ago
parent a57c0f13ec
commit 3a8ef77ef6
  1. 36
      TournamentStats.xcodeproj/xcuserdata/laurent.xcuserdatad/xcdebugger/Breakpoints_v2.xcbkptlist
  2. 205
      TournamentStats/scrapper/Downloader.swift

@ -40,9 +40,9 @@
filePath = "TournamentStats/scrapper/Downloader.swift" filePath = "TournamentStats/scrapper/Downloader.swift"
startingColumnNumber = "9223372036854775807" startingColumnNumber = "9223372036854775807"
endingColumnNumber = "9223372036854775807" endingColumnNumber = "9223372036854775807"
startingLineNumber = "188" startingLineNumber = "203"
endingLineNumber = "188" endingLineNumber = "203"
landmarkName = "downloadWSOPResults(tournamentInfo:)" landmarkName = "downloadWSOPResults(url:date:currentTournament:)"
landmarkType = "7"> landmarkType = "7">
</BreakpointContent> </BreakpointContent>
</BreakpointProxy> </BreakpointProxy>
@ -56,9 +56,9 @@
filePath = "TournamentStats/scrapper/Downloader.swift" filePath = "TournamentStats/scrapper/Downloader.swift"
startingColumnNumber = "9223372036854775807" startingColumnNumber = "9223372036854775807"
endingColumnNumber = "9223372036854775807" endingColumnNumber = "9223372036854775807"
startingLineNumber = "161" startingLineNumber = "177"
endingLineNumber = "161" endingLineNumber = "177"
landmarkName = "downloadWSOPResults(tournamentInfo:)" landmarkName = "downloadWSOPResults(url:date:currentTournament:)"
landmarkType = "7"> landmarkType = "7">
</BreakpointContent> </BreakpointContent>
</BreakpointProxy> </BreakpointProxy>
@ -72,9 +72,9 @@
filePath = "TournamentStats/scrapper/Downloader.swift" filePath = "TournamentStats/scrapper/Downloader.swift"
startingColumnNumber = "9223372036854775807" startingColumnNumber = "9223372036854775807"
endingColumnNumber = "9223372036854775807" endingColumnNumber = "9223372036854775807"
startingLineNumber = "137" startingLineNumber = "153"
endingLineNumber = "137" endingLineNumber = "153"
landmarkName = "downloadWSOPResults(tournamentInfo:)" landmarkName = "downloadWSOPResults(url:date:currentTournament:)"
landmarkType = "7"> landmarkType = "7">
</BreakpointContent> </BreakpointContent>
</BreakpointProxy> </BreakpointProxy>
@ -88,9 +88,9 @@
filePath = "TournamentStats/scrapper/Downloader.swift" filePath = "TournamentStats/scrapper/Downloader.swift"
startingColumnNumber = "9223372036854775807" startingColumnNumber = "9223372036854775807"
endingColumnNumber = "9223372036854775807" endingColumnNumber = "9223372036854775807"
startingLineNumber = "130" startingLineNumber = "146"
endingLineNumber = "130" endingLineNumber = "146"
landmarkName = "downloadWSOPResults(tournamentInfo:)" landmarkName = "downloadWSOPResults(url:date:currentTournament:)"
landmarkType = "7"> landmarkType = "7">
</BreakpointContent> </BreakpointContent>
</BreakpointProxy> </BreakpointProxy>
@ -104,9 +104,9 @@
filePath = "TournamentStats/scrapper/Downloader.swift" filePath = "TournamentStats/scrapper/Downloader.swift"
startingColumnNumber = "9223372036854775807" startingColumnNumber = "9223372036854775807"
endingColumnNumber = "9223372036854775807" endingColumnNumber = "9223372036854775807"
startingLineNumber = "138" startingLineNumber = "154"
endingLineNumber = "138" endingLineNumber = "154"
landmarkName = "downloadWSOPResults(tournamentInfo:)" landmarkName = "downloadWSOPResults(url:date:currentTournament:)"
landmarkType = "7"> landmarkType = "7">
</BreakpointContent> </BreakpointContent>
</BreakpointProxy> </BreakpointProxy>
@ -120,9 +120,9 @@
filePath = "TournamentStats/scrapper/Downloader.swift" filePath = "TournamentStats/scrapper/Downloader.swift"
startingColumnNumber = "9223372036854775807" startingColumnNumber = "9223372036854775807"
endingColumnNumber = "9223372036854775807" endingColumnNumber = "9223372036854775807"
startingLineNumber = "153" startingLineNumber = "169"
endingLineNumber = "153" endingLineNumber = "169"
landmarkName = "downloadWSOPResults(tournamentInfo:)" landmarkName = "downloadWSOPResults(url:date:currentTournament:)"
landmarkType = "7"> landmarkType = "7">
</BreakpointContent> </BreakpointContent>
</BreakpointProxy> </BreakpointProxy>

@ -18,13 +18,13 @@ class Downloader {
static func downloadWSOPResults() { static func downloadWSOPResults() {
let infos = [ let infos: [TS] = [
// TS(id: "19728", date: "02/10/2021"), // TS(id: "19728", date: "02/10/2021"),
// TS(id: "19729", date: "03/10/2021"), // TS(id: "19729", date: "03/10/2021"),
// TS(id: "19730", date: "02/10/2021"), // TS(id: "19730", date: "02/10/2021"),
// TS(id: "19732", date: "04/10/2021"), // TS(id: "19732", date: "04/10/2021"),
// TS(id: "19733", date: "05/10/2021"), // TS(id: "19733", date: "05/10/2021"),
// TS(id: "19731", date: "06/10/2021"), TS(id: "19731", date: "06/10/2021"),
// TS(id: "19734", date: "06/10/2021"), // TS(id: "19734", date: "06/10/2021"),
// TS(id: "19926", date: "06/10/2021"), // TS(id: "19926", date: "06/10/2021"),
// TS(id: "19927", date: "06/10/2021"), // TS(id: "19927", date: "06/10/2021"),
@ -40,7 +40,7 @@ class Downloader {
// TS(id: "19745", date: "12/10/2021"), // TS(id: "19745", date: "12/10/2021"),
// TS(id: "19747", date: "12/10/2021"), // TS(id: "19747", date: "12/10/2021"),
// TS(id: "19748", date: "12/10/2021"), // TS(id: "19748", date: "12/10/2021"),
TS(id: "19749", date: "13/10/2021"), // TS(id: "19749", date: "13/10/2021"),
] ]
for info in infos { for info in infos {
@ -59,12 +59,24 @@ class Downloader {
} }
fileprivate static func downloadWSOPResults(tournamentInfo: TS) { fileprivate static func downloadWSOPResults(tournamentInfo: TS) {
print("Importing tournament \(tournamentInfo.id)") print("Importing tournament \(tournamentInfo.id)")
guard let url = URL(string: "https://www.wsop.com/tournaments/results/?aid=2&grid=4865&tid=\(tournamentInfo.id)&rr=5") else { guard let url = URL(string: "https://www.wsop.com/tournaments/results/?aid=2&grid=4865&tid=\(tournamentInfo.id)&rr=5") else {
return return
} }
let realm = try! Realm()
do {
try realm.write {
self.downloadWSOPResults(url: url, date: tournamentInfo.date)
}
} catch {
print("error = \(error)")
}
}
fileprivate static func downloadWSOPResults(url: URL, date: String, currentTournament: Tournament? = nil) {
let realm = try! Realm() let realm = try! Realm()
@ -82,26 +94,28 @@ class Downloader {
if let infos = infos { if let infos = infos {
try realm.write { var tournament: Tournament? = currentTournament
let tournament = realm.create(Tournament.self) if tournament == nil {
let tour = realm.create(Tournament.self)
tournament = tour
if let name = infos.slice(from: "<h1>", to: "</h1>") { if let name = infos.slice(from: "<h1>", to: "</h1>") {
tournament.name = name tour.name = name
tournament.date = Formatter.basicDate.date(from: tournamentInfo.date) ?? Date() tour.date = Formatter.basicDate.date(from: date) ?? Date()
if let strNumber = name.slice(from: "Event #", to: ":"), let number = nf.number(from: strNumber) { if let strNumber = name.slice(from: "Event #", to: ":"), let number = nf.number(from: strNumber) {
tournament.number = number.intValue tour.number = number.intValue
} }
if name.contains("6-Handed") { if name.contains("6-Handed") {
tournament.tableSize = 6 tour.tableSize = 6
} else if name.contains("8-Handed") { } else if name.contains("8-Handed") {
tournament.tableSize = 8 tour.tableSize = 8
} else if name.contains("Heads-Up") { } else if name.contains("Heads-Up") {
tournament.tableSize = 2 tour.tableSize = 2
} }
} else { } else {
print("no name = \(infos)") print("no name = \(infos)")
} }
@ -109,101 +123,116 @@ class Downloader {
if let stats = string.slice(from: "<ul id=\"eventstats\">", to: "<div id=\"nav-tabs\">") { if let stats = string.slice(from: "<ul id=\"eventstats\">", to: "<div id=\"nav-tabs\">") {
if let strBuyin = stats.slice(from: "Buy-in:</b> &#36;", to: "</li>"), let buyin = nf.number(from: strBuyin) { if let strBuyin = stats.slice(from: "Buy-in:</b> &#36;", to: "</li>"), let buyin = nf.number(from: strBuyin) {
tournament.buyin = buyin.intValue tour.buyin = buyin.intValue
} else { } else {
print("no buyin = \(stats)") print("no buyin = \(stats)")
} }
if let strprizepool = stats.slice(from: "Prizepool:</b> &#36;", to: " </li>"), let pp = nf.number(from: strprizepool) { if let strprizepool = stats.slice(from: "Prizepool:</b> &#36;", to: " </li>"), let pp = nf.number(from: strprizepool) {
tournament.prizepool = pp.doubleValue tour.prizepool = pp.doubleValue
} else { } else {
print("no prizepool = \(stats)") print("no prizepool = \(stats)")
} }
if let strentries = stats.slice(from: "Entries:</b> ", to: " </li>"), let entries = nf.number(from: strentries) { if let strentries = stats.slice(from: "Entries:</b> ", to: " </li>"), let entries = nf.number(from: strentries) {
tournament.entries = entries.intValue tour.entries = entries.intValue
} else { } else {
print("no entries = \(stats)") print("no entries = \(stats)")
} }
} }
if let strresultsblock = string.slice(from: "Country</li>", to: "<div id=\"PagingNav\"") { }
let strresults = strresultsblock.components(separatedBy: "<li class='place")
if let strresultsblock = string.slice(from: "Country</li>", to: "<div id=\"PagingNav\"") {
let strresults = strresultsblock.components(separatedBy: "<li class='place")
print("Results count = \(strresults.count)")
for strresult in strresults {
print("Results count = \(strresults.count)") let cells = strresult.components(separatedBy: "<li")
if !(cells.count == 6 || cells.count == 7) {
print("cell count = \(cells.count)")
continue
}
for strresult in strresults { let hasPOY = cells.count == 7
let cells = strresult.components(separatedBy: "<li") var rank: Int? = nil
if !(cells.count == 6 || cells.count == 7) { var name: String? = nil
print("cell count = \(cells.count)") var earnings: Double? = nil
continue var city: String? = nil
} var state: String? = nil
var country: String? = nil
let hasPOY = cells.count == 7 var poy: Double? = nil
var rank: Int? = nil if let strrank = cells[0].slice(from: ">", to: "<") {
var name: String? = nil rank = nf.number(from: strrank)?.intValue
var earnings: Double? = nil }
var city: String? = nil
var state: String? = nil if let nameraw = cells[1].components(separatedBy: "'>").last {
var country: String? = nil name = nameraw.replacingOccurrences(of: "</a></li>", with: "")
var poy: Double? = nil }
if let strrank = cells[0].slice(from: ">", to: "<") { if let earningsraw = cells[2].components(separatedBy: "&#36;").last {
rank = nf.number(from: strrank)?.intValue let strearnings = earningsraw.replacingOccurrences(of: "</li>", with: "")
} earnings = nf.number(from: strearnings)?.doubleValue
}
if let nameraw = cells[1].components(separatedBy: "'>").last {
name = nameraw.replacingOccurrences(of: "</a></li>", with: "") if hasPOY, let poyraw = cells[3].components(separatedBy: "\'>").last {
} let strpoy = poyraw.replacingOccurrences(of: "</li>", with: "")
poy = nf.number(from: strpoy)?.doubleValue
if let earningsraw = cells[2].components(separatedBy: "&#36;").last { }
let strearnings = earningsraw.replacingOccurrences(of: "</li>", with: "")
earnings = nf.number(from: strearnings)?.doubleValue let cityIndex = hasPOY ? 4 : 3
} if let cityraw = cells[cityIndex].components(separatedBy: "\'>").last {
city = cityraw.replacingOccurrences(of: "</li>", with: "")
if hasPOY, let poyraw = cells[3].components(separatedBy: "\'>").last { }
let strpoy = poyraw.replacingOccurrences(of: "</li>", with: "")
poy = nf.number(from: strpoy)?.doubleValue let stateIndex = hasPOY ? 5 : 4
} if let stateraw
= cells[stateIndex].components(separatedBy: "\'>").last {
let cityIndex = hasPOY ? 4 : 3 state = stateraw.replacingOccurrences(of: "</li>", with: "")
if let cityraw = cells[cityIndex].components(separatedBy: "\'>").last { }
city = cityraw.replacingOccurrences(of: "</li>", with: "") let countryIndex = hasPOY ? 6 : 5
} if let countryraw
= cells[countryIndex].components(separatedBy: "\'>").last {
let stateIndex = hasPOY ? 5 : 4 country = String(countryraw.prefix(2))
if let stateraw }
= cells[stateIndex].components(separatedBy: "\'>").last {
state = stateraw.replacingOccurrences(of: "</li>", with: "") guard let rank = rank, let name = name, let earnings = earnings else {
} print("Missing data: R: \(rank), N: \(name), E: \(earnings)")
let countryIndex = hasPOY ? 6 : 5 continue
if let countryraw }
= cells[countryIndex].components(separatedBy: "\'>").last {
country = String(countryraw.prefix(2)) let player = realm.getOrCreatePlayer(name: name, city: city ?? "", state: state ?? "", country: country ?? "")
// country = countrytmp.prefix(2)
} let result = realm.create(Result.self)
result.player = player
guard let rank = rank, let name = name, let earnings = earnings else { result.earnings = earnings
print("Missing data: R: \(rank), N: \(name), E: \(earnings)") result.rank = rank
continue
tournament?.results.append(result)
}
if currentTournament == nil, let paging = string.slice(from: "<div id=\"PagingNav\"", to: "<div class=\"banner\">") {
let pages = paging.components(separatedBy: "<a href='")
if pages.count > 1 {
for i in 2..<pages.count { // don't do page 1 again
let components = pages[i].components(separatedBy: "'")
if let urlString = components.first,
let url = URL(string: "https://www.wsop.com\(urlString)") {
self.downloadWSOPResults(url: url, date: date, currentTournament: tournament)
}
} }
let player = realm.getOrCreatePlayer(name: name, city: city ?? "", state: state ?? "", country: country ?? "")
let result = realm.create(Result.self)
result.player = player
result.earnings = earnings
result.rank = rank
tournament.results.append(result)
} }
} else {
print("Bad page strresultsblock")
} }
} else {
print("Bad page strresultsblock")
} }
} else { } else {
print("Missing name block") print("Missing name block")

Loading…
Cancel
Save