Improve scraper to download multiple result pages

master
Laurent Morvillier 4 years ago
parent a57c0f13ec
commit 3a8ef77ef6
  1. 36
      TournamentStats.xcodeproj/xcuserdata/laurent.xcuserdatad/xcdebugger/Breakpoints_v2.xcbkptlist
  2. 205
      TournamentStats/scrapper/Downloader.swift

@ -40,9 +40,9 @@
filePath = "TournamentStats/scrapper/Downloader.swift"
startingColumnNumber = "9223372036854775807"
endingColumnNumber = "9223372036854775807"
startingLineNumber = "188"
endingLineNumber = "188"
landmarkName = "downloadWSOPResults(tournamentInfo:)"
startingLineNumber = "203"
endingLineNumber = "203"
landmarkName = "downloadWSOPResults(url:date:currentTournament:)"
landmarkType = "7">
</BreakpointContent>
</BreakpointProxy>
@ -56,9 +56,9 @@
filePath = "TournamentStats/scrapper/Downloader.swift"
startingColumnNumber = "9223372036854775807"
endingColumnNumber = "9223372036854775807"
startingLineNumber = "161"
endingLineNumber = "161"
landmarkName = "downloadWSOPResults(tournamentInfo:)"
startingLineNumber = "177"
endingLineNumber = "177"
landmarkName = "downloadWSOPResults(url:date:currentTournament:)"
landmarkType = "7">
</BreakpointContent>
</BreakpointProxy>
@ -72,9 +72,9 @@
filePath = "TournamentStats/scrapper/Downloader.swift"
startingColumnNumber = "9223372036854775807"
endingColumnNumber = "9223372036854775807"
startingLineNumber = "137"
endingLineNumber = "137"
landmarkName = "downloadWSOPResults(tournamentInfo:)"
startingLineNumber = "153"
endingLineNumber = "153"
landmarkName = "downloadWSOPResults(url:date:currentTournament:)"
landmarkType = "7">
</BreakpointContent>
</BreakpointProxy>
@ -88,9 +88,9 @@
filePath = "TournamentStats/scrapper/Downloader.swift"
startingColumnNumber = "9223372036854775807"
endingColumnNumber = "9223372036854775807"
startingLineNumber = "130"
endingLineNumber = "130"
landmarkName = "downloadWSOPResults(tournamentInfo:)"
startingLineNumber = "146"
endingLineNumber = "146"
landmarkName = "downloadWSOPResults(url:date:currentTournament:)"
landmarkType = "7">
</BreakpointContent>
</BreakpointProxy>
@ -104,9 +104,9 @@
filePath = "TournamentStats/scrapper/Downloader.swift"
startingColumnNumber = "9223372036854775807"
endingColumnNumber = "9223372036854775807"
startingLineNumber = "138"
endingLineNumber = "138"
landmarkName = "downloadWSOPResults(tournamentInfo:)"
startingLineNumber = "154"
endingLineNumber = "154"
landmarkName = "downloadWSOPResults(url:date:currentTournament:)"
landmarkType = "7">
</BreakpointContent>
</BreakpointProxy>
@ -120,9 +120,9 @@
filePath = "TournamentStats/scrapper/Downloader.swift"
startingColumnNumber = "9223372036854775807"
endingColumnNumber = "9223372036854775807"
startingLineNumber = "153"
endingLineNumber = "153"
landmarkName = "downloadWSOPResults(tournamentInfo:)"
startingLineNumber = "169"
endingLineNumber = "169"
landmarkName = "downloadWSOPResults(url:date:currentTournament:)"
landmarkType = "7">
</BreakpointContent>
</BreakpointProxy>

@ -18,13 +18,13 @@ class Downloader {
static func downloadWSOPResults() {
let infos = [
let infos: [TS] = [
// TS(id: "19728", date: "02/10/2021"),
// TS(id: "19729", date: "03/10/2021"),
// TS(id: "19730", date: "02/10/2021"),
// TS(id: "19732", date: "04/10/2021"),
// TS(id: "19733", date: "05/10/2021"),
// TS(id: "19731", date: "06/10/2021"),
TS(id: "19731", date: "06/10/2021"),
// TS(id: "19734", date: "06/10/2021"),
// TS(id: "19926", date: "06/10/2021"),
// TS(id: "19927", date: "06/10/2021"),
@ -40,7 +40,7 @@ class Downloader {
// TS(id: "19745", date: "12/10/2021"),
// TS(id: "19747", date: "12/10/2021"),
// TS(id: "19748", date: "12/10/2021"),
TS(id: "19749", date: "13/10/2021"),
// TS(id: "19749", date: "13/10/2021"),
]
for info in infos {
@ -59,12 +59,24 @@ class Downloader {
}
/// Imports the WSOP results for a single tournament.
///
/// Builds the results-page URL by interpolating `tournamentInfo.id` into the
/// query string, then calls `downloadWSOPResults(url:date:currentTournament:)`
/// (leaving `currentTournament` at its default) from inside one Realm write block.
///
/// - Parameter tournamentInfo: Supplies the `id` placed in the URL and the
///   `date` string forwarded to the page importer.
fileprivate static func downloadWSOPResults(tournamentInfo: TS) {
print("Importing tournament \(tournamentInfo.id)")
// Give up without logging if the interpolated string is not a valid URL.
guard let url = URL(string: "https://www.wsop.com/tournaments/results/?aid=2&grid=4865&tid=\(tournamentInfo.id)&rr=5") else {
return
}
// `try!` traps at runtime if the Realm cannot be opened.
let realm = try! Realm()
do {
// The whole page import runs inside this single write block.
try realm.write {
self.downloadWSOPResults(url: url, date: tournamentInfo.date)
}
} catch {
// Errors thrown by the write are only printed; nothing is propagated to the caller.
print("error = \(error)")
}
}
fileprivate static func downloadWSOPResults(url: URL, date: String, currentTournament: Tournament? = nil) {
let realm = try! Realm()
@ -82,26 +94,28 @@ class Downloader {
if let infos = infos {
try realm.write {
let tournament = realm.create(Tournament.self)
var tournament: Tournament? = currentTournament
if tournament == nil {
let tour = realm.create(Tournament.self)
tournament = tour
if let name = infos.slice(from: "<h1>", to: "</h1>") {
tournament.name = name
tournament.date = Formatter.basicDate.date(from: tournamentInfo.date) ?? Date()
tour.name = name
tour.date = Formatter.basicDate.date(from: date) ?? Date()
if let strNumber = name.slice(from: "Event #", to: ":"), let number = nf.number(from: strNumber) {
tournament.number = number.intValue
tour.number = number.intValue
}
if name.contains("6-Handed") {
tournament.tableSize = 6
tour.tableSize = 6
} else if name.contains("8-Handed") {
tournament.tableSize = 8
tour.tableSize = 8
} else if name.contains("Heads-Up") {
tournament.tableSize = 2
tour.tableSize = 2
}
} else {
print("no name = \(infos)")
}
@ -109,101 +123,116 @@ class Downloader {
if let stats = string.slice(from: "<ul id=\"eventstats\">", to: "<div id=\"nav-tabs\">") {
if let strBuyin = stats.slice(from: "Buy-in:</b> &#36;", to: "</li>"), let buyin = nf.number(from: strBuyin) {
tournament.buyin = buyin.intValue
tour.buyin = buyin.intValue
} else {
print("no buyin = \(stats)")
}
if let strprizepool = stats.slice(from: "Prizepool:</b> &#36;", to: " </li>"), let pp = nf.number(from: strprizepool) {
tournament.prizepool = pp.doubleValue
tour.prizepool = pp.doubleValue
} else {
print("no prizepool = \(stats)")
}
if let strentries = stats.slice(from: "Entries:</b> ", to: " </li>"), let entries = nf.number(from: strentries) {
tournament.entries = entries.intValue
tour.entries = entries.intValue
} else {
print("no entries = \(stats)")
}
}
if let strresultsblock = string.slice(from: "Country</li>", to: "<div id=\"PagingNav\"") {
let strresults = strresultsblock.components(separatedBy: "<li class='place")
}
if let strresultsblock = string.slice(from: "Country</li>", to: "<div id=\"PagingNav\"") {
let strresults = strresultsblock.components(separatedBy: "<li class='place")
print("Results count = \(strresults.count)")
for strresult in strresults {
print("Results count = \(strresults.count)")
let cells = strresult.components(separatedBy: "<li")
if !(cells.count == 6 || cells.count == 7) {
print("cell count = \(cells.count)")
continue
}
for strresult in strresults {
let cells = strresult.components(separatedBy: "<li")
if !(cells.count == 6 || cells.count == 7) {
print("cell count = \(cells.count)")
continue
}
let hasPOY = cells.count == 7
var rank: Int? = nil
var name: String? = nil
var earnings: Double? = nil
var city: String? = nil
var state: String? = nil
var country: String? = nil
var poy: Double? = nil
if let strrank = cells[0].slice(from: ">", to: "<") {
rank = nf.number(from: strrank)?.intValue
}
if let nameraw = cells[1].components(separatedBy: "'>").last {
name = nameraw.replacingOccurrences(of: "</a></li>", with: "")
}
if let earningsraw = cells[2].components(separatedBy: "&#36;").last {
let strearnings = earningsraw.replacingOccurrences(of: "</li>", with: "")
earnings = nf.number(from: strearnings)?.doubleValue
}
if hasPOY, let poyraw = cells[3].components(separatedBy: "\'>").last {
let strpoy = poyraw.replacingOccurrences(of: "</li>", with: "")
poy = nf.number(from: strpoy)?.doubleValue
}
let cityIndex = hasPOY ? 4 : 3
if let cityraw = cells[cityIndex].components(separatedBy: "\'>").last {
city = cityraw.replacingOccurrences(of: "</li>", with: "")
}
let stateIndex = hasPOY ? 5 : 4
if let stateraw
= cells[stateIndex].components(separatedBy: "\'>").last {
state = stateraw.replacingOccurrences(of: "</li>", with: "")
}
let countryIndex = hasPOY ? 6 : 5
if let countryraw
= cells[countryIndex].components(separatedBy: "\'>").last {
country = String(countryraw.prefix(2))
// country = countrytmp.prefix(2)
}
guard let rank = rank, let name = name, let earnings = earnings else {
print("Missing data: R: \(rank), N: \(name), E: \(earnings)")
continue
let hasPOY = cells.count == 7
var rank: Int? = nil
var name: String? = nil
var earnings: Double? = nil
var city: String? = nil
var state: String? = nil
var country: String? = nil
var poy: Double? = nil
if let strrank = cells[0].slice(from: ">", to: "<") {
rank = nf.number(from: strrank)?.intValue
}
if let nameraw = cells[1].components(separatedBy: "'>").last {
name = nameraw.replacingOccurrences(of: "</a></li>", with: "")
}
if let earningsraw = cells[2].components(separatedBy: "&#36;").last {
let strearnings = earningsraw.replacingOccurrences(of: "</li>", with: "")
earnings = nf.number(from: strearnings)?.doubleValue
}
if hasPOY, let poyraw = cells[3].components(separatedBy: "\'>").last {
let strpoy = poyraw.replacingOccurrences(of: "</li>", with: "")
poy = nf.number(from: strpoy)?.doubleValue
}
let cityIndex = hasPOY ? 4 : 3
if let cityraw = cells[cityIndex].components(separatedBy: "\'>").last {
city = cityraw.replacingOccurrences(of: "</li>", with: "")
}
let stateIndex = hasPOY ? 5 : 4
if let stateraw
= cells[stateIndex].components(separatedBy: "\'>").last {
state = stateraw.replacingOccurrences(of: "</li>", with: "")
}
let countryIndex = hasPOY ? 6 : 5
if let countryraw
= cells[countryIndex].components(separatedBy: "\'>").last {
country = String(countryraw.prefix(2))
}
guard let rank = rank, let name = name, let earnings = earnings else {
print("Missing data: R: \(rank), N: \(name), E: \(earnings)")
continue
}
let player = realm.getOrCreatePlayer(name: name, city: city ?? "", state: state ?? "", country: country ?? "")
let result = realm.create(Result.self)
result.player = player
result.earnings = earnings
result.rank = rank
tournament?.results.append(result)
}
if currentTournament == nil, let paging = string.slice(from: "<div id=\"PagingNav\"", to: "<div class=\"banner\">") {
let pages = paging.components(separatedBy: "<a href='")
if pages.count > 1 {
for i in 2..<pages.count { // don't do page 1 again
let components = pages[i].components(separatedBy: "'")
if let urlString = components.first,
let url = URL(string: "https://www.wsop.com\(urlString)") {
self.downloadWSOPResults(url: url, date: date, currentTournament: tournament)
}
}
let player = realm.getOrCreatePlayer(name: name, city: city ?? "", state: state ?? "", country: country ?? "")
let result = realm.create(Result.self)
result.player = player
result.earnings = earnings
result.rank = rank
tournament.results.append(result)
}
} else {
print("Bad page strresultsblock")
}
} else {
print("Bad page strresultsblock")
}
} else {
print("Missing name block")

Loading…
Cancel
Save