#!/usr/bin/env python3
"""
Gardenate Zone 8b Scraper - Selenium Click Mode
Requirements: pip install selenium beautifulsoup4 lxml requests
apt install chromium-browser chromium-chromedriver
Usage: source venv/bin/activate && python3 scraper.py
"""
import json
import time
import os
import re
import requests as req_lib
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
# Root of the Gardenate site; relative plant hrefs are appended to this.
BASE_URL = "https://www.gardenate.com"
# URL-encoded climate-zone value used in the "zone" query parameter.
ZONE_PARAM = "USA+-+Zone+8b"

# Output locations are resolved relative to the directory holding this script.
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
OUTPUT_FILE = os.path.join(BASE_DIR, "data", "plants.json")
IMG_DIR = os.path.join(BASE_DIR, "images")

# Delay value in seconds (presumably the pause between page loads; the code
# that consumes it is outside this section).
DELAY = 2.0

# Month number (1-12) -> English month name.
MONTHS = dict(
    enumerate(
        (
            "January",
            "February",
            "March",
            "April",
            "May",
            "June",
            "July",
            "August",
            "September",
            "October",
            "November",
            "December",
        ),
        start=1,
    )
)
def make_driver():
    """Create a headless Chrome WebDriver configured for scraping.

    A fixed list of common chromedriver install locations is probed in
    order. The first path that exists on disk is passed to Selenium through
    an explicit ``Service``; when none exists, ``webdriver.Chrome`` is
    created without an explicit driver path.

    Returns:
        The ``webdriver.Chrome`` instance.
    """
    chrome_flags = (
        "--headless",
        "--no-sandbox",
        "--disable-dev-shm-usage",
        "--disable-gpu",
        "--window-size=1920,1080",
        "user-agent=Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
    )
    candidate_paths = (
        "/usr/bin/chromedriver",
        "/usr/lib/chromium-browser/chromedriver",
        "/usr/lib/chromium/chromedriver",
        "/snap/bin/chromedriver",
    )

    options = Options()
    for flag in chrome_flags:
        options.add_argument(flag)

    driver_path = next(
        (path for path in candidate_paths if os.path.exists(path)), None
    )
    if driver_path is None:
        print("Using system chromedriver")
        return webdriver.Chrome(options=options)

    print(f"Using chromedriver: {driver_path}")
    return webdriver.Chrome(service=Service(driver_path), options=options)
def slugify(name):
    """Convert a plant name into a lowercase, filename-safe slug.

    Each run of whitespace becomes a single underscore, after which every
    character other than ``a-z``, ``0-9`` and ``_`` is dropped.

    Args:
        name: The display name to convert.

    Returns:
        The slug string (may be empty if nothing survives the filtering).
    """
    underscored = re.sub(r"\s+", "_", name.lower())
    return re.sub(r"[^a-z0-9_]", "", underscored)
def download_image(img_session, plant_name, href, img_dir):
    """Fetch a plant's 100px thumbnail and store it in *img_dir*.

    The file is named after the slugified plant name. A file already on disk
    that is larger than 500 bytes is reused without any network request.
    Failures are reported with a warning and never raised to the caller.

    Args:
        img_session: Object exposing ``get(url, timeout=...)`` whose result
            has ``status_code`` and ``content`` attributes.
        plant_name: Display name of the plant; determines the file name.
        href: Site-relative plant path appended to ``BASE_URL``.
        img_dir: Directory in which the image file is written.

    Returns:
        The relative path ``images/<slug>.jpg`` on success or cache hit,
        otherwise an empty string.
    """
    slug = slugify(plant_name)
    relative_path = f"images/{slug}.jpg"
    local_path = os.path.join(img_dir, f"{slug}.jpg")
    thumb_url = f"{BASE_URL}{href}/thumb/100"

    # A previously saved image above the size threshold counts as valid.
    if os.path.exists(local_path) and os.path.getsize(local_path) > 500:
        return relative_path

    try:
        response = img_session.get(thumb_url, timeout=10)
        # Responses of 500 bytes or fewer are treated as "no image".
        if response.status_code != 200 or len(response.content) <= 500:
            print(f" [WARN] No image for {plant_name} (HTTP {response.status_code})")
            return ""
        with open(local_path, "wb") as out_file:
            out_file.write(response.content)
        print(f" Downloaded image: {slug}.jpg")
        return relative_path
    except Exception as err:
        # Best effort: a missing thumbnail must not abort the scrape.
        print(f" [WARN] Image error for {plant_name}: {err}")
        return ""
# NOTE(review): the original MARKER string literal was lost -- it survived only
# as an empty, unterminated quote pair split across two lines, which is a
# syntax error. Judging by the docstring and the error message it held the
# opening tag of the "plant-list" div. This pattern is a reconstruction that
# accepts any <div ...> opening tag containing the exact token "plant-list"
# (not e.g. "plant-list-item"); confirm against the live page markup.
_PLANT_LIST_MARKER_RE = re.compile(
    r"<div\b[^>]*(?<![\w-])plant-list(?![\w-])[^>]*>", re.IGNORECASE
)
# Plant detail links look like "/plant/<slug>" with no further path segments.
_PLANT_HREF_RE = re.compile(r"^/plant/[^/]+$")


def _row_action(row_div, name_col):
    """Return the action text of a plant row, or "" when none is found."""
    action = ""
    for div in row_div.find_all("div", recursive=False):
        classes = div.get("class", [])
        if "d-none" in classes and "d-md-block" in classes and "col-md" in classes:
            action = div.get_text(strip=True)
            break
    if not action:
        # Fall back to the "d-md-none" div nested inside the name column.
        mobile = name_col.find("div", class_="d-md-none")
        if mobile:
            action = mobile.get_text(strip=True)
    return action


def _row_aliases(name_col):
    """Return the alternative names listed in the row's first <i> element."""
    italic = name_col.find("i")
    if not italic:
        return []
    alias_text = re.sub(
        r"^also\s*,?\s*", "", italic.get_text(strip=True), flags=re.IGNORECASE
    ).strip()
    return [alias.strip() for alias in alias_text.split(",") if alias.strip()]


def extract_plants_from_html(html, month_num):
    """Parse the plants listed in the first plant-list div of a page.

    The raw HTML is cut off where a second plant-list marker begins, so that
    only rows belonging to the first list are parsed. Each table row that
    holds a ``div.row`` (other than ``feed-ad`` rows) with a ``col-md-6``
    child containing a ``/plant/<slug>`` link yields one record. Plants are
    de-duplicated by name; the first occurrence wins.

    Args:
        html: Raw page source.
        month_num: Month number of the page. Not used by the parsing itself;
            kept so existing callers keep working.

    Returns:
        A list of dicts with the keys ``name``, ``aliases``, ``action``,
        ``href`` and ``detail_url``. Empty when no marker is found.
    """
    first_marker = _PLANT_LIST_MARKER_RE.search(html)
    if first_marker is None:
        print(" [ERROR] plant-list marker not found")
        return []

    second_marker = _PLANT_LIST_MARKER_RE.search(html, first_marker.end())
    if second_marker is not None:
        # Drop everything from the second list onwards before parsing.
        html = html[:second_marker.start()]

    soup = BeautifulSoup(html, "lxml")
    plants = []
    seen = set()
    for tr in soup.find_all("tr"):
        td = tr.find("td")
        if not td:
            continue
        row_div = td.find("div", class_="row")
        if not row_div:
            continue
        if "feed-ad" in row_div.get("class", []):
            continue

        # The name column is the first direct child div carrying "col-md-6".
        name_col = next(
            (
                div
                for div in row_div.find_all("div", recursive=False)
                if "col-md-6" in div.get("class", [])
            ),
            None,
        )
        if not name_col:
            continue

        link = name_col.find("a", href=_PLANT_HREF_RE)
        if not link:
            continue
        plant_name = link.get_text(strip=True)
        href = link.get("href", "")
        if not plant_name or plant_name in seen:
            continue

        seen.add(plant_name)
        plants.append({
            "name": plant_name,
            "aliases": _row_aliases(name_col),
            "action": _row_action(row_div, name_col),
            "href": href,
            "detail_url": f"{BASE_URL}{href}?zone={ZONE_PARAM}",
        })
    return plants
def parse_month_page(driver, month_num):
print(f" Processing month {month_num} ({MONTHS[month_num]})")
clicked = False
# Strategy 1: anchor href containing m={month_num}
if not clicked:
try:
btns = driver.find_elements(By.CSS_SELECTOR, f"a[href*='m={month_num}']")
for btn in btns:
href = btn.get_attribute("href") or ""
if re.search(rf"[?&]m={month_num}(&|$)", href):
driver.execute_script("arguments[0].click();", btn)
clicked = True
print(f" Clicked href: {href}")
break
except Exception as e:
print(f" Strategy 1 failed: {e}")
# Strategy 2: month-selector element by text
if not clicked:
try:
abbrevs = {1:"Jan",2:"Feb",3:"Mar",4:"Apr",5:"May",6:"Jun",
7:"Jul",8:"Aug",9:"Sep",10:"Oct",11:"Nov",12:"Dec"}
sel = driver.find_element(By.CLASS_NAME, "month-selector")
btns = sel.find_elements(By.TAG_NAME, "a") + sel.find_elements(By.TAG_NAME, "button")
print(f" month-selector has {len(btns)} elements")
for btn in btns:
txt = btn.text.strip()
if txt == str(month_num) or txt.startswith(abbrevs[month_num]):
driver.execute_script("arguments[0].click();", btn)
clicked = True
print(f" Clicked text: '{txt}'")
break
except Exception as e:
print(f" Strategy 2 failed: {e}")
# Strategy 3: data-month or value attribute
if not clicked:
try:
btn = driver.find_element(By.CSS_SELECTOR,
f"[data-month='{month_num}'], [value='{month_num}']")
driver.execute_script("arguments[0].click();", btn)
clicked = True
print(f" Clicked data-month={month_num}")
except Exception as e:
print(f" Strategy 3 failed: {e}")
# Strategy 4: