# FARM/scraper.py

#!/usr/bin/env python3
"""
Gardenate Zone 8b Scraper - Selenium Click Mode
Requirements: pip install selenium beautifulsoup4 lxml requests
              apt install chromium-browser chromium-chromedriver
Usage: source venv/bin/activate && python3 scraper.py
"""

import json
import time
import os
import re
import requests as req_lib
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Site root; hrefs scraped from the listing are appended to this.
BASE_URL = "https://www.gardenate.com"
# Value sent as the `zone` query parameter (already URL-encoded).
ZONE_PARAM = "USA+-+Zone+8b"

# Output locations, all anchored at the directory containing this script.
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
OUTPUT_FILE = os.path.join(BASE_DIR, "data", "plants.json")
IMG_DIR = os.path.join(BASE_DIR, "images")

# Seconds to pause after each plant detail page fetch.
DELAY = 2.0

# Month number (1-12) -> English month name, used in progress output.
MONTHS = {
    1: "January",
    2: "February",
    3: "March",
    4: "April",
    5: "May",
    6: "June",
    7: "July",
    8: "August",
    9: "September",
    10: "October",
    11: "November",
    12: "December",
}


def make_driver():
    """Create a headless Chrome WebDriver.

    A handful of well-known chromedriver install locations are probed in
    order; the first one that exists on disk is passed to Selenium as an
    explicit ``Service``. When none exists, Chrome is constructed without an
    explicit driver path.

    Returns:
        A ``webdriver.Chrome`` instance configured with the flags below.
    """
    chrome_flags = (
        "--headless",
        "--no-sandbox",
        "--disable-dev-shm-usage",
        "--disable-gpu",
        "--window-size=1920,1080",
        "user-agent=Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
    )
    candidate_paths = (
        "/usr/bin/chromedriver",
        "/usr/lib/chromium-browser/chromedriver",
        "/usr/lib/chromium/chromedriver",
        "/snap/bin/chromedriver",
    )

    opts = Options()
    for flag in chrome_flags:
        opts.add_argument(flag)

    # First existing candidate wins; None means no known location matched.
    driver_path = next(
        (path for path in candidate_paths if os.path.exists(path)), None
    )
    if driver_path is None:
        print("Using system chromedriver")
        return webdriver.Chrome(options=opts)

    print(f"Using chromedriver: {driver_path}")
    return webdriver.Chrome(service=Service(driver_path), options=opts)


def slugify(name):
    """Reduce a plant name to a lowercase ``[a-z0-9_]`` identifier.

    Each run of whitespace becomes a single underscore; every remaining
    character outside ``a-z``, ``0-9`` and ``_`` is dropped.
    """
    underscored = re.sub(r"\s+", "_", name.lower())
    return re.sub(r"[^a-z0-9_]", "", underscored)


def download_image(img_session, plant_name, href, img_dir):
    """Fetch a plant's thumbnail from Gardenate and store it in ``img_dir``.

    The thumbnail URL is ``BASE_URL + href + "/thumb/100"`` and the file is
    saved as ``<slug>.jpg``. A file already on disk that is larger than 500
    bytes is reused without a network request.

    Args:
        img_session: Object with a ``get(url, timeout=...)`` method whose
            response exposes ``status_code`` and ``content``.
        plant_name: Display name; used for the file slug and log messages.
        href: Site-relative plant path (e.g. ``/plant/...``).
        img_dir: Directory the image file is written to.

    Returns:
        ``"images/<slug>.jpg"`` when a usable image is on disk, otherwise
        ``""``. Any exception during the download or write is reported and
        also yields ``""``.
    """
    slug = slugify(plant_name)
    relative_path = f"images/{slug}.jpg"
    local_file = os.path.join(img_dir, f"{slug}.jpg")

    # Treat anything over 500 bytes as a previously downloaded image.
    if os.path.exists(local_file) and os.path.getsize(local_file) > 500:
        return relative_path

    thumb_url = f"{BASE_URL}{href}/thumb/100"
    try:
        response = img_session.get(thumb_url, timeout=10)
        usable = response.status_code == 200 and len(response.content) > 500
        if not usable:
            print(f"      [WARN] No image for {plant_name} (HTTP {response.status_code})")
            return ""
        with open(local_file, "wb") as out:
            out.write(response.content)
        print(f"      Downloaded image: {slug}.jpg")
        return relative_path
    except Exception as exc:
        print(f"      [WARN] Image error for {plant_name}: {exc}")
        return ""


def extract_plants_from_html(html, month_num):
    """Collect plant entries from the page up to its second plant list.

    The HTML is cut off at the second occurrence of the literal
    ``<div class="plant-list">`` marker (if there is one), then every table
    row in what remains is inspected for a plant link.

    Args:
        html: Raw page HTML.
        month_num: Not used by this function.

    Returns:
        A list of dicts with keys ``name``, ``aliases``, ``action``, ``href``
        and ``detail_url``, one per distinct plant name in document order.
        Empty when the marker does not occur at all.
    """
    marker = '<div class="plant-list">'
    start = html.find(marker)
    if start == -1:
        print("  [ERROR] plant-list marker not found")
        return []
    next_list = html.find(marker, start + len(marker))
    if next_list != -1:
        html = html[:next_list]

    plant_href = re.compile(r"^/plant/[^/]+$")
    desktop_classes = ("d-none", "d-md-block", "col-md")
    results = []
    names_seen = set()

    for row in BeautifulSoup(html, "lxml").find_all("tr"):
        cell = row.find("td")
        if cell is None:
            continue
        container = cell.find("div", class_="row")
        if container is None:
            continue
        if "feed-ad" in container.get("class", []):
            continue

        # Only direct child divs are considered for the name/action columns.
        children = container.find_all("div", recursive=False)
        name_cell = next(
            (c for c in children if "col-md-6" in c.get("class", [])), None
        )
        if name_cell is None:
            continue

        anchor = name_cell.find("a", href=plant_href)
        if anchor is None:
            continue

        name = anchor.get_text(strip=True)
        if not name or name in names_seen:
            continue
        href = anchor.get("href", "")

        # Prefer the desktop-only action column; fall back to the mobile one
        # nested in the name column when the former is missing or empty.
        desktop = next(
            (c for c in children
             if all(k in c.get("class", []) for k in desktop_classes)),
            None,
        )
        action = desktop.get_text(strip=True) if desktop is not None else ""
        if not action:
            mobile = name_cell.find("div", class_="d-md-none")
            if mobile is not None:
                action = mobile.get_text(strip=True)

        # Aliases live in an <i> element, optionally prefixed with "also".
        aliases = []
        italic = name_cell.find("i")
        if italic is not None:
            alias_text = re.sub(
                r"^also\s*,?\s*", "", italic.get_text(strip=True),
                flags=re.IGNORECASE,
            ).strip()
            aliases = [part.strip() for part in alias_text.split(",") if part.strip()]

        names_seen.add(name)
        results.append({
            "name":       name,
            "aliases":    aliases,
            "action":     action,
            "href":       href,
            "detail_url": f"{BASE_URL}{href}?zone={ZONE_PARAM}",
        })

    return results


def _click_month_by_href(driver, month_num):
    """Strategy 1: click an anchor whose href carries the parameter ``m=<month_num>``."""
    # The CSS substring match also hits e.g. m=10 when looking for m=1, so
    # every candidate is re-checked with an anchored pattern.
    exact_param = re.compile(rf"[?&]m={month_num}(&|$)")
    for anchor in driver.find_elements(By.CSS_SELECTOR, f"a[href*='m={month_num}']"):
        href = anchor.get_attribute("href") or ""
        if exact_param.search(href):
            driver.execute_script("arguments[0].click();", anchor)
            print(f"    Clicked href: {href}")
            return True
    return False


def _click_month_by_text(driver, month_num):
    """Strategy 2: click a link/button in ``.month-selector`` matched by its text."""
    abbrev = MONTHS[month_num][:3]  # "Jan", "Feb", ...
    selector = driver.find_element(By.CLASS_NAME, "month-selector")
    candidates = (selector.find_elements(By.TAG_NAME, "a")
                  + selector.find_elements(By.TAG_NAME, "button"))
    print(f"    month-selector has {len(candidates)} elements")
    for element in candidates:
        label = element.text.strip()
        if label == str(month_num) or label.startswith(abbrev):
            driver.execute_script("arguments[0].click();", element)
            print(f"    Clicked text: '{label}'")
            return True
    return False


def _click_month_by_attribute(driver, month_num):
    """Strategy 3: click the first element with a matching ``data-month`` or ``value``."""
    element = driver.find_element(
        By.CSS_SELECTOR,
        f"[data-month='{month_num}'], [value='{month_num}']")
    driver.execute_script("arguments[0].click();", element)
    print(f"    Clicked data-month={month_num}")
    return True


def _select_month_option(driver, month_num):
    """Strategy 4: choose the month in a ``<select>`` element."""
    from selenium.webdriver.support.ui import Select
    select_el = driver.find_element(
        By.CSS_SELECTOR,
        "select[name='m'], select.month, #month-select")
    Select(select_el).select_by_value(str(month_num))
    print("    Selected via <select>")
    return True


def parse_month_page(driver, month_num):
    """Switch the loaded page to ``month_num`` and return that month's plants.

    Four ways of activating the month are tried in order (href, selector
    text, data attribute, ``<select>``). The first that succeeds stops the
    search; a strategy that raises is logged and the next one is tried.
    After a fixed 3 second pause the page source is handed to
    ``extract_plants_from_html``.

    Args:
        driver: Selenium WebDriver with the listing page already loaded.
        month_num: Month number; must be a key of ``MONTHS``.

    Returns:
        The list produced by ``extract_plants_from_html``, or ``[]`` when no
        strategy managed to activate the month.
    """
    print(f"  Processing month {month_num} ({MONTHS[month_num]})")

    strategies = (
        _click_month_by_href,
        _click_month_by_text,
        _click_month_by_attribute,
        _select_month_option,
    )
    clicked = False
    for number, strategy in enumerate(strategies, 1):
        try:
            clicked = strategy(driver, month_num)
        except Exception as e:
            print(f"    Strategy {number} failed: {e}")
        if clicked:
            break

    if not clicked:
        print(f"    [ERROR] Could not click month {month_num}")
        # Dump the selector markup to help diagnose a changed page layout.
        # `except Exception` (not a bare except) so Ctrl-C still interrupts.
        try:
            sel = driver.find_element(By.CLASS_NAME, "month-selector")
            outer_html = sel.get_attribute("outerHTML") or ""
            print(f"    Selector HTML:\n{outer_html[:1000]}")
        except Exception:
            print("    month-selector element not found")
        return []

    # Fixed pause to let the page update after the click.
    time.sleep(3)
    html = driver.page_source
    plants = extract_plants_from_html(html, month_num)
    print(f"  Found {len(plants)} plants in {MONTHS[month_num]}")
    return plants


def parse_plant_detail(driver, plant_name, detail_url):
    """Load a plant's detail page and extract its growing information.

    Args:
        driver: Selenium WebDriver used to fetch the page.
        plant_name: Display name, used only for log output.
        detail_url: Absolute URL of the detail page.

    Returns:
        A dict with keys ``description``, ``spacing``, ``height``,
        ``row_spacing``, ``harvest_time``, ``notes`` (strings, ``""`` when
        not found), ``pot_friendly`` (bool) and ``compatible`` / ``avoid``
        (lists of strings).
    """
    print(f"    Detail: {plant_name}")
    driver.get(detail_url)
    try:
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, "body")))
    except Exception as e:
        # Best effort: still parse whatever loaded, but leave a trace
        # instead of swallowing the failure silently.
        print(f"      [WARN] Wait for page body failed for {plant_name}: {e}")
    time.sleep(2)

    soup = BeautifulSoup(driver.page_source, "lxml")
    info = {
        "description":  "",
        "spacing":       "",
        "height":        "",
        "row_spacing":   "",
        "harvest_time":  "",
        "pot_friendly":  False,
        "notes":         "",
        "compatible":    [],
        "avoid":         [],
    }

    desc_el = (soup.find("div", class_="description") or
               soup.find("div", class_="plant-description") or
               soup.find("p",   class_="description"))
    if desc_el:
        info["description"] = desc_el.get_text(" ", strip=True)
    else:
        # Fallback: first paragraph with more than 60 characters of text.
        # Named `container` so it does not shadow the module's main().
        container = soup.find("main") or soup.find("div", id="content") or soup
        for p in container.find_all("p"):
            text = p.get_text(strip=True)
            if len(text) > 60:
                info["description"] = text
                break

    # NOTE(review): text nodes are joined with spaces, so the `[^\n]` bounds
    # in the patterns below rarely stop at a field boundary and values may
    # run on into following text (up to the 120-char cap) -- confirm against
    # the real page markup before tightening.
    full = soup.get_text(" ", strip=True)

    def extract_after(text, label, prefix=""):
        """Return up to 120 chars following ``label``; ``prefix`` is a raw regex guard."""
        m = re.search(prefix + re.escape(label) + r"[:\s]*([^\n]{1,120})",
                      text, re.IGNORECASE)
        return m.group(1).strip() if m else ""

    # The search is case-insensitive, so a plain "Spacing" would also match
    # the tail of "Row spacing" and return the row-spacing value; the
    # lookbehind skips that occurrence.
    info["spacing"]      = extract_after(full, "Spacing", r"(?<!row\s)")
    info["row_spacing"]  = extract_after(full, "Row spacing")
    info["height"]       = extract_after(full, "Height")
    info["harvest_time"] = extract_after(full, "Harvest")

    pot = re.search(r"can grow in (pots|containers)[:\s]*([^\n]+)", full, re.IGNORECASE)
    if pot:
        info["pot_friendly"] = pot.group(2).strip().lower().startswith(("yes", "true"))

    notes_el = soup.find("div", class_="notes") or soup.find("div", class_="tips")
    if notes_el:
        info["notes"] = notes_el.get_text(" ", strip=True)

    # Companion lists: comma-separated text up to the first period/newline.
    cm = re.search(r"(?:grows well with|compatible)[:\s]+([^\n.]+)", full, re.IGNORECASE)
    if cm:
        info["compatible"] = [x.strip() for x in cm.group(1).split(",") if x.strip()]

    av = re.search(r"(?:avoid growing|keep away)[:\s]+([^\n.]+)", full, re.IGNORECASE)
    if av:
        info["avoid"] = [x.strip() for x in av.group(1).split(",") if x.strip()]

    return info


def _load_database(path):
    """Return previously saved scrape data from ``path``, or a fresh structure.

    A missing file, unparseable JSON, or a top-level value that is not an
    object all yield a fresh ``{"zone": ..., "plants": {}}`` structure. A
    parsed object without a usable ``plants`` dict gets an empty one, so
    callers can always index ``["plants"]``.
    """
    fresh = {"zone": "USA - Zone 8b", "plants": {}}
    if not os.path.exists(path):
        return fresh
    try:
        with open(path, "r", encoding="utf-8") as f:
            data = json.load(f)
    except (json.JSONDecodeError, UnicodeDecodeError) as e:
        print(f"[WARN] Could not parse {path} ({e}); starting fresh.")
        return fresh
    if not isinstance(data, dict):
        print(f"[WARN] Unexpected JSON structure in {path}; starting fresh.")
        return fresh
    if not isinstance(data.get("plants"), dict):
        data["plants"] = {}
    print(f"Resuming — {len(data['plants'])} plants already saved.")
    return data


def _save_database(data, path):
    """Write ``data`` as JSON to ``path`` without ever leaving a truncated file."""
    # Write to a sibling temp file, then swap it in: an interrupted run can
    # no longer corrupt the existing database (which would otherwise be
    # discarded as unparseable on the next start).
    tmp_path = f"{path}.tmp"
    with open(tmp_path, "w", encoding="utf-8") as f:
        json.dump(data, f, indent=2)
    os.replace(tmp_path, path)


def main():
    """Run the full scrape: month listings, images, then per-plant details.

    Steps:
        1. Load the zone page and collect the plant list for each of the
           12 months.
        2. Create a database record (and download a thumbnail) for every
           plant not already known, and record its action per month.
        3. Fetch the detail page of every plant that has no description
           longer than 20 characters yet, saving after each one.

    The database is read from and written to ``OUTPUT_FILE``, so an
    interrupted run resumes where it left off. The browser is always shut
    down, even when a step raises.
    """
    print("=" * 60)
    print("Gardenate Zone 8b Scraper (Selenium - Click Mode)")
    print("=" * 60)

    os.makedirs(os.path.dirname(OUTPUT_FILE), exist_ok=True)
    os.makedirs(IMG_DIR, exist_ok=True)

    all_data = _load_database(OUTPUT_FILE)
    plants_db = all_data["plants"]

    img_session = req_lib.Session()
    img_session.headers.update({"User-Agent": "Mozilla/5.0 (compatible; FarmGardenBot/1.0)"})

    print("\nStarting Chrome (headless)...")
    driver = make_driver()

    try:
        print("\nLoading base page...")
        driver.get(f"{BASE_URL}/?zone={ZONE_PARAM}")
        time.sleep(4)

        # Diagnostic dump of the month selector markup; purely informational.
        try:
            sel = driver.find_element(By.CLASS_NAME, "month-selector")
            outer_html = sel.get_attribute("outerHTML") or ""
            print(f"\nMonth selector HTML:\n{outer_html[:2000]}\n")
        except Exception as e:
            print(f"Could not find month-selector: {e}")

        # Step 1: Scan all 12 months
        print("\n[Step 1] Scanning all 12 months...")
        month_plant_map = {}
        for month_num in range(1, 13):
            month_plant_map[month_num] = parse_month_page(driver, month_num)
            time.sleep(1)

        # Step 2: Build plant database + download images
        print("\n[Step 2] Building plant database + downloading images...")
        for month_num, month_plants in month_plant_map.items():
            for p in month_plants:
                slug = slugify(p["name"])
                if slug not in plants_db:
                    href     = p.get("href", f"/plant/{slug}")
                    img_path = download_image(img_session, p["name"], href, IMG_DIR)
                    plants_db[slug] = {
                        "name":         p["name"],
                        "aliases":      p["aliases"],
                        "image":        img_path,
                        "description":  "",
                        "spacing":      "",
                        "height":       "",
                        "row_spacing":  "",
                        "harvest_time": "",
                        "pot_friendly": False,
                        "notes":        "",
                        "compatible":   [],
                        "avoid":        [],
                        "months":       {},
                        "detail_url":   p["detail_url"],
                    }
                    time.sleep(0.5)

                record = plants_db[slug]
                # Records loaded from an older file may lack these keys.
                record.setdefault("months", {})[str(month_num)] = (
                    p["action"] if p["action"] else "Plant"
                )
                if p["aliases"] and not record.get("aliases"):
                    record["aliases"] = p["aliases"]

        _save_database(all_data, OUTPUT_FILE)
        print(f"  Saved {len(plants_db)} plants to JSON")

        # Step 3: Fetch plant details
        print(f"\n[Step 3] Fetching details for {len(plants_db)} plants...")
        for i, (slug, plant) in enumerate(plants_db.items(), 1):
            name = plant.get("name", slug)
            if plant.get("description") and len(plant["description"]) > 20:
                print(f"  [{i}/{len(plants_db)}] Skipping {name} (cached)")
                continue

            detail_url = plant.get("detail_url", f"{BASE_URL}/plant/{slug}?zone={ZONE_PARAM}")
            details    = parse_plant_detail(driver, name, detail_url)
            plant.update(details)

            # Save after every plant so progress survives an interruption.
            _save_database(all_data, OUTPUT_FILE)

            time.sleep(DELAY)

    finally:
        driver.quit()

    _save_database(all_data, OUTPUT_FILE)

    print(f"\n[Done] Saved {len(plants_db)} plants to {OUTPUT_FILE}")
    print(f"Images saved to: {IMG_DIR}")


if __name__ == "__main__":
    main()