from selenium import webdriver
import time
import pandas as pd
from bs4 import BeautifulSoup
import datetime
from datetime import timedelta


def _scroll_to_bottom(browser, pause=1.5):
    """Scroll *browser* down repeatedly until the page height stops growing.

    Args:
        browser: Selenium WebDriver whose current page should be scrolled.
        pause: Seconds to wait after each scroll so new content can load.
    """
    last_height = browser.execute_script("return document.body.scrollHeight")

    while True:
        browser.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(pause)
        # Step back 50px from the bottom before measuring again
        # (presumably to re-trigger lazy loading -- confirm against the site).
        browser.execute_script("window.scrollTo(0, document.body.scrollHeight-50);")
        time.sleep(pause)

        new_height = browser.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            # Height unchanged after a full scroll cycle: nothing more loaded.
            return
        last_height = new_height


def _review_rows(html, hotel):
    """Return one ``[hotel, star, review, date]`` row per review in *html*.

    Args:
        html: BeautifulSoup document of the fully scrolled review page.
        hotel: Hotel name stored in the first column of every row.

    Returns:
        A list of rows. ``star`` is ``None`` when a review has no score
        container; ``review`` / ``date`` are empty strings when the
        corresponding tag is missing.
    """
    rows = []
    for item in html.select("div.item-wrap"):
        text_tag = item.select_one("p")
        date_tag = item.select_one('time')

        # Stars are counted as five minus the number of gray icons. When
        # several score containers exist the last one wins, as before.
        score_tags = item.select('.container.score')
        if score_tags:
            star_count = 5 - len(score_tags[-1].select(".ico.gray"))
        else:
            # Never reuse the previous review's score (or hit a NameError).
            star_count = None

        rows.append([
            hotel,
            star_count,
            text_tag.text if text_tag is not None else "",
            date_tag.text if date_tag is not None else "",
        ])
    return rows


def yanolja_crawl(url):
    """Crawl every review of one hotel page and append them to the global ``df``.

    Uses the module-level Selenium ``driver``: opens *url*, reads the hotel
    name, opens the review page, scrolls until no more content loads, and
    parses the resulting page source.

    Args:
        url: Address of the hotel detail page to crawl.

    Returns:
        None. The rows are appended to the module-level ``df`` with the
        columns ``hotel``, ``star``, ``review`` and ``date``.
    """
    global df  # the result frame lives at module level
    # Imported here so the block is self-contained; By-based locators work on
    # both old and new Selenium releases, unlike find_element_by_*.
    from selenium.webdriver.common.by import By

    driver.get(url)

    hotel = driver.find_element(
        By.CSS_SELECTOR, 'section.PlaceDetailTitle_titleContainer__3sGdf h1').text
    # Strip the promotional prefix from the hotel name.
    hotel = hotel.replace("[★숙박대전] ", "")

    # Open the review page.
    driver.find_element(
        By.CSS_SELECTOR, 'section.PlaceDetailTitle_titleContainer__3sGdf > a').click()
    time.sleep(1)

    # The review list uses infinite scroll; load everything before parsing.
    _scroll_to_bottom(driver, pause=1.5)

    html = BeautifulSoup(driver.page_source, "html.parser")

    columns = ['hotel', 'star', 'review', 'date']
    rows = _review_rows(html, hotel)
    if rows:
        # One concat per page instead of one DataFrame.append per review:
        # append no longer exists in pandas 2.x and copied the frame each time.
        df = pd.concat([df, pd.DataFrame(rows, columns=columns)],
                       ignore_index=True)
    
def change_date(x, now=None):
    """Convert a relative review timestamp into a ``"%Y. %m. %d"`` date string.

    Handles the two relative forms ``"<N>시간 전"`` (N hours ago) and
    ``"<N>일 전"`` (N days ago) by subtracting the amount from *now*.
    Any other value is returned unchanged.

    Args:
        x: Date text scraped from a review, e.g. ``"15시간 전"``, ``"3일 전"``
            or ``"2018. 07. 31"``.
        now: Reference moment for relative values. Defaults to
            ``datetime.datetime.today()``.

    Returns:
        The formatted date string for relative inputs; otherwise *x* as given
        (including non-string values and relative forms whose amount is not
        an integer).
    """
    if not isinstance(x, str):
        return x
    if now is None:
        now = datetime.datetime.today()

    # (suffix found in the scraped text, matching timedelta keyword)
    relative_units = (("시간 전", "hours"), ("일 전", "days"))
    for suffix, unit in relative_units:
        if suffix not in x:
            continue
        try:
            amount = int(x.replace(suffix, ""))
        except ValueError:
            # Unexpected wording around the number: leave the text as-is.
            return x
        # Real date arithmetic handles any count (10 days, 30 hours, ...) and
        # the midnight boundary, unlike substring checks on the digits.
        return (now - timedelta(**{unit: amount})).strftime("%Y. %m. %d")

    return x


# Hotel detail pages to crawl. Commented-out entries are other hotels that
# can be enabled when needed.
url_list = [
    # "https://www.yanolja.com/hotel/3001542",  # Shilla Stay Gwanghwamun
    # "https://www.yanolja.com/hotel/3009497",  # Nine Tree Premier Myeongdong 2
    # "https://www.yanolja.com/hotel/3008478",  # Lotte Hotel Seoul
    # "https://www.yanolja.com/hotel/3000775",  # Grand Walkerhill Seoul
    "https://www.yanolja.com/hotel/3015391",  # Novotel Yongsan
]

columns = ['hotel', 'star', 'review', 'date']
df = pd.DataFrame(columns=columns)  # result frame, filled by yanolja_crawl


# NOTE(review): passing the chromedriver path positionally only works on older
# Selenium releases; newer ones expect a Service object -- confirm the
# installed version before upgrading.
driver = webdriver.Chrome('./chromedriver')
try:
    for url in url_list:
        yanolja_crawl(url)
finally:
    # Always shut the browser down, even if a crawl fails midway.
    driver.quit()


# Resolve relative timestamps ("N시간 전", "N일 전") into calendar dates.
df['real_date'] = df['date'].apply(change_date)

df


# Keep the scraped text under 'raw_date' so that renaming 'real_date' does not
# leave the frame with two columns that are both called 'date'.
df = df.rename(columns={'date': 'raw_date', 'real_date': 'date'})


# df.to_excel("야놀자_최종.xlsx")


# df.to_csv("야놀자_최종.csv")

	hotel	star	review	date	real_date
0	노보텔 앰배서더 서울 용산	5	역시 시설도 깨끗하고 좋았어요\n친절했구요	15시간 전	2020. 12. 02
1	노보텔 앰배서더 서울 용산	4	되게좋앗어요	20시간 전	2020. 12. 01
2	노보텔 앰배서더 서울 용산	5	방도 깨끗하고 위치도 좋았습니다!	23시간 전	2020. 12. 01
3	노보텔 앰배서더 서울 용산	4	가성비도 좋고 분위기가 좋아요	1일 전	2020. 12. 01
4	노보텔 앰배서더 서울 용산	3	안녕하세요	1일 전	2020. 12. 01
...	...	...	...	...	...
2238	노보텔 앰배서더 서울 용산	5	좋아요 너무너무	2018. 07. 31	2018. 07. 31
2239	노보텔 앰배서더 서울 용산	5	체크인이 오래걸려요	2018. 07. 29	2018. 07. 29
2240	노보텔 앰배서더 서울 용산	5	위치랑 방도 깨끗하고 좋네요	2018. 07. 27	2018. 07. 27
2241	노보텔 앰배서더 서울 용산	5	깨끗하고 침대 너무 좋아요 다음에 또 이용할거에요	2018. 07. 23	2018. 07. 23
2242	노보텔 앰배서더 서울 용산	5	위치,서비스,룸컨디션,인테리어\n다좋았는데 방음에 취약했어요~\n주말이라 가족단위 ...	2018. 07. 22	2018. 07. 22

크롤링에 들어가기 전

필요한 라이브러리 import

크롤링 함수

데이터 프레임에 크롤링 데이터 담아주기

파일 내보내기

크롤링에 들어가기 전

필요한 라이브러리 import

크롤링 함수

데이터 프레임에 크롤링 데이터 담아주기

파일 내보내기

필요한 라이브러리 import