seleniumでウェブスクレイピング2

去年、スキーチケットの価格を楽に調査しようと思って、PythonとSeleniumで作ったウェブスクレイピングのスクリプトはこちらでした。

tomo3i.hatenablog.com

そして、無事取得できたことがうれしくて、何度も実行した結果、Akamaiから不正アクセスIPとして認定されるという苦痛を味わったのは、そこに書いた通りです。それ以来Akamaiは嫌いです。Akamai使ってなくてもレスポンス良いサーバーなんていくらでもあると思う。

1週間ぐらいおとなしくしていると無事にいろんなアクセス制限が解除されたので、今回のところは最悪1週間待てばなんとかなる、という安心感?のもと、待ち時間をたっぷり入れて人間がアクセスしてるのと変わらない感じにしておきます。去年とは価格が書かれているサイトの構造がいくつか変わっていたので更新しました。

というわけで、全ソースコードを貼っておきます。くれぐれもご注意ください。

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions
from selenium.webdriver.common.by import By
import re
import time

# Epic Pass pages: day passes (4-day and 7-day variants selected via the
# ``days`` query parameter) and the Tahoe Local / Tahoe Value / Epic passes.
url_epic4 = "https://www.epicpass.com/passes/epic-day-pass.aspx?days=4"
url_epic7 = "https://www.epicpass.com/passes/epic-day-pass.aspx?days=7"
url_epic_tahoe_local = "https://www.epicpass.com/passes/tahoe-local-pass.aspx"
url_epic_tahoe_value = "https://www.epicpass.com/passes/tahoe-value-pass.aspx"
url_epic = "https://www.epicpass.com/passes/epic-pass.aspx"

# Squaw Alpine "Tahoe Super 4" lift ticket pack page.
url_tahoe = "https://squawalpine.com/tickets-passes/lift-tickets/tahoe-super-4-lift-ticket-pack"

# Sugar Bowl eStore product listings.
# NOTE: "unristriced" is a misspelling, but the name is referenced as-is by
# sugar_unristricted(), so it must not be renamed in isolation.
url_sugar_unristriced = "https://estore.sugarbowl.com/eStore/Content/Commerce/Products/DisplayProducts.aspx?ProductGroupCode=1101&AspxAutoDetectCookieSupport=1"
url_sugar_slightly = "https://estore.sugarbowl.com/eStore/Content/Commerce/Products/DisplayProducts.aspx?ProductGroupCode=1101&ProductCategoryCode=8023"

# Ikon Pass and Ikon Base Pass shop pages (2019-2020 season URLs).
url_ikon = "https://www.ikonpass.com/en/shop-passes/ikon-pass-2019-2020"
url_ikon_base = "https://www.ikonpass.com/en/shop-passes/ikon-base-pass-2019-2020"


def epic_day(url):
    """Return the price text shown on an Epic Day Pass page.

    Starts a headless Chrome session, loads ``url``, waits up to 10 seconds
    for the price element to become visible and returns its text. The browser
    is always shut down, even when loading or waiting fails.

    Args:
        url: Address of the Epic Day Pass page to scrape.

    Returns:
        The visible text of the price element.

    Raises:
        selenium.common.exceptions.TimeoutException: If the price element does
            not become visible within 10 seconds.
    """
    selector = "#epic_day_pass_detail__category__product_1 > div.epic_day_pass_detail__category__product_context.col-xs-7 > div.epic_day_pass_detail__category__product_price.c143__price--v1"

    options = Options()
    # Passing the flag explicitly works on every Selenium release; the
    # ``headless`` property setter was deprecated and removed in Selenium 4.
    options.add_argument("--headless")

    driver = webdriver.Chrome(options=options)
    try:
        driver.get(url)
        wait = WebDriverWait(driver, 10)
        e = wait.until(expected_conditions.visibility_of_element_located((By.CSS_SELECTOR, selector)))
        return e.text
    finally:
        # quit() ends the whole session (browser and chromedriver process),
        # whereas close() only closes the current window. Running it in
        # ``finally`` prevents orphaned browsers when the wait times out.
        driver.quit()

def epic(url):
    """Return the price text shown on an Epic season-pass detail page.

    Starts a headless Chrome session, loads ``url``, waits up to 10 seconds
    for the price element to become visible and returns its text. The browser
    is always shut down, even when loading or waiting fails.

    Args:
        url: Address of the Epic pass page to scrape.

    Returns:
        The visible text of the price element.

    Raises:
        selenium.common.exceptions.TimeoutException: If the price element does
            not become visible within 10 seconds.
    """
    selector = "#c27_Product_Detail_0 > div.col-xs-3.hidden-xs.pass_category_detail__price_col > span"

    options = Options()
    # Passing the flag explicitly works on every Selenium release; the
    # ``headless`` property setter was deprecated and removed in Selenium 4.
    options.add_argument("--headless")

    driver = webdriver.Chrome(options=options)
    try:
        driver.get(url)
        wait = WebDriverWait(driver, 10)
        e = wait.until(expected_conditions.visibility_of_element_located((By.CSS_SELECTOR, selector)))
        return e.text
    finally:
        # quit() ends the whole session (browser and chromedriver process),
        # whereas close() only closes the current window. Running it in
        # ``finally`` prevents orphaned browsers when the wait times out.
        driver.quit()

def epic4():
    """Scrape the 4-day Epic Day Pass price and print it with its label."""
    price = epic_day(url_epic4)
    print("epic4: " + price)

def epic7():
    """Scrape the 7-day Epic Day Pass price and print it with its label."""
    price = epic_day(url_epic7)
    print("epic7: " + price)

def epic_tahoe_local():
    """Scrape the Tahoe Local Pass price and print it with its label."""
    price = epic(url_epic_tahoe_local)
    print("epic tahoe local: " + price)

def epic_tahoe_value():
    """Scrape the Tahoe Value Pass price and print it with its label."""
    price = epic(url_epic_tahoe_value)
    print("epic tahoe value: " + price)

def epic_epic():
    """Scrape the Epic Pass price and print it with its label."""
    price = epic(url_epic)
    print("epic: " + price)

def tahoe4(url):
    """Scrape the Tahoe Super 4 pack page at ``url`` and print its price.

    Starts a headless Chrome session, loads ``url``, waits up to 10 seconds
    for the first table header cell to become visible, extracts the first
    ``$<digits>`` amount from its text and prints it prefixed with
    ``"tahoe super4: "``. The browser is always shut down, even on failure.

    Args:
        url: Address of the lift ticket pack page to scrape.

    Raises:
        selenium.common.exceptions.TimeoutException: If the header cell does
            not become visible within 10 seconds.
        ValueError: If the header cell text contains no dollar amount.
    """
    selector = "#content > div.wrapper.clearfix > article > div.field.field-name-body.field-type-text-with-summary.field-label-hidden > div > div > div > div > table > tbody > tr:nth-child(1) > th"

    options = Options()
    # Passing the flag explicitly works on every Selenium release; the
    # ``headless`` property setter was deprecated and removed in Selenium 4.
    options.add_argument("--headless")

    driver = webdriver.Chrome(options=options)
    try:
        driver.get(url)
        wait = WebDriverWait(driver, 10)
        e = wait.until(expected_conditions.visibility_of_element_located((By.CSS_SELECTOR, selector)))
        # Copy the text out while the session is still alive.
        text = e.text
    finally:
        # quit() ends the whole session (browser and chromedriver process);
        # ``finally`` guarantees cleanup when the wait times out.
        driver.quit()

    # Raw string: "\$" in a normal string literal is an invalid escape sequence.
    match = re.search(r"\$[0-9]+", text)
    if match is None:
        # The page layout probably changed; fail with a message that shows
        # what was actually scraped instead of an opaque AttributeError.
        raise ValueError("no dollar amount found in: " + repr(text))
    print("tahoe super4: " + match.group())


def sugar(url):
    """Return the price text of a product listed on a Sugar Bowl eStore page.

    Starts a headless Chrome session, loads ``url``, waits up to 10 seconds
    for the price label to become visible and returns its text. The browser
    is always shut down, even when loading or waiting fails.

    Args:
        url: Address of the eStore product listing to scrape.

    Returns:
        The visible text of the price label.

    Raises:
        selenium.common.exceptions.TimeoutException: If the price label does
            not become visible within 10 seconds.
    """
    selector = "#ctl00_ctl00_ctl00_ctl00_ContentPlaceHolder1_ContentPlaceHolder1_contentMain_commerceMain_ProductSelectControl_ProductSelectList_gvProducts_ctl02_lblPrice"

    options = Options()
    # Passing the flag explicitly works on every Selenium release; the
    # ``headless`` property setter was deprecated and removed in Selenium 4.
    options.add_argument("--headless")

    driver = webdriver.Chrome(options=options)
    try:
        driver.get(url)
        wait = WebDriverWait(driver, 10)
        e = wait.until(expected_conditions.visibility_of_element_located((By.CSS_SELECTOR, selector)))
        return e.text
    finally:
        # quit() ends the whole session (browser and chromedriver process),
        # whereas close() only closes the current window. Running it in
        # ``finally`` prevents orphaned browsers when the wait times out.
        driver.quit()

def sugar_unristricted():
    """Scrape the first Sugar Bowl listing price and print it with its label."""
    price = sugar(url_sugar_unristriced)
    print("Sugar unristricted: " + price)

def sugar_slightly():
    """Scrape the second Sugar Bowl listing price and print it with its label."""
    price = sugar(url_sugar_slightly)
    print("Sugar slightly: " + price)

def ikon(url):
    """Return the price text shown in the cart widget of an Ikon pass page.

    Starts a headless Chrome session, loads ``url``, waits up to 10 seconds
    for the price heading of the first pass in the cart widget to become
    visible and returns its text. The browser is always shut down, even when
    loading or waiting fails.

    Args:
        url: Address of the Ikon pass shop page to scrape.

    Returns:
        The visible text of the price heading.

    Raises:
        selenium.common.exceptions.TimeoutException: If the price heading does
            not become visible within 10 seconds.
    """
    selector = "#pass-cart-widget > div.pass-cart-widget-left > div.pass-cart-widget-passes > div:nth-child(1) > div.pass-cart-widget-pass-actions > h4"

    options = Options()
    # Passing the flag explicitly works on every Selenium release; the
    # ``headless`` property setter was deprecated and removed in Selenium 4.
    options.add_argument("--headless")

    driver = webdriver.Chrome(options=options)
    try:
        driver.get(url)
        wait = WebDriverWait(driver, 10)
        e = wait.until(expected_conditions.visibility_of_element_located((By.CSS_SELECTOR, selector)))
        return e.text
    finally:
        # quit() ends the whole session (browser and chromedriver process),
        # whereas close() only closes the current window. Running it in
        # ``finally`` prevents orphaned browsers when the wait times out.
        driver.quit()

def ikon_ikon():
    """Scrape the Ikon Pass price and print it with its label."""
    price = ikon(url_ikon)
    print("Ikon: " + price)

def ikon_base():
    """Scrape the Ikon Base Pass price and print it with its label."""
    price = ikon(url_ikon_base)
    print("Ikon base: " + price)

# Run every scraper once, in a fixed order. Each entry pairs a scraper with
# the number of seconds to pause afterwards; the pauses keep the request rate
# low. The final entry has no pause because nothing follows it.
_schedule = (
    (epic4, 5),
    (epic7, 6),
    (epic_tahoe_local, 7),
    (epic_tahoe_value, 5),
    (epic_epic, 6),
    (lambda: tahoe4(url_tahoe), 7),
    (sugar_unristricted, 5),
    (sugar_slightly, 6),
    (ikon_ikon, 7),
    (ikon_base, None),
)

for _job, _pause in _schedule:
    _job()
    if _pause is not None:
        time.sleep(_pause)