#!/usr/bin/env python3
"""
Gardenate Zone 8b Scraper - Selenium Click Mode

Requirements:
    pip install selenium beautifulsoup4 lxml requests
    apt install chromium-browser chromium-chromedriver

Usage:
    source venv/bin/activate && python3 scraper.py
"""

import json
import os
import re
import time

import requests as req_lib
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import Select, WebDriverWait

BASE_URL = "https://www.gardenate.com"
ZONE_PARAM = "USA+-+Zone+8b"
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
OUTPUT_FILE = os.path.join(BASE_DIR, "data", "plants.json")
IMG_DIR = os.path.join(BASE_DIR, "images")
DELAY = 2.0  # seconds to wait between plant detail requests

MONTHS = {
    1: "January", 2: "February", 3: "March", 4: "April",
    5: "May", 6: "June", 7: "July", 8: "August",
    9: "September", 10: "October", 11: "November", 12: "December",
}

# Locations probed, in order, for a chromedriver binary.
CHROMEDRIVER_PATHS = (
    "/usr/bin/chromedriver",
    "/usr/lib/chromium-browser/chromedriver",
    "/usr/lib/chromium/chromedriver",
    "/snap/bin/chromedriver",
)

# Images (on disk or downloaded) at or below this size are treated as missing.
MIN_IMAGE_BYTES = 500

# Opening tag of an element whose class/id contains the "plant-list" token.
# NOTE(review): the literal marker string was unreadable in the reviewed copy
# of this file; this pattern is reconstructed from the surrounding docstring
# and error message. Confirm it against the live page markup.
PLANT_LIST_RE = re.compile(
    r"""<\w+\b[^>]*\b(?:class|id)\s*=\s*["'][^"']*(?<![\w-])plant-list(?![\w-])""",
    re.IGNORECASE,
)
PLANT_LINK_RE = re.compile(r"^/plant/[^/]+$")
ALIAS_PREFIX_RE = re.compile(r"^also\s*,?\s*", re.IGNORECASE)


def make_driver():
    """Create a headless Chrome WebDriver.

    Uses the first chromedriver binary found in CHROMEDRIVER_PATHS; when none
    exists, lets Selenium locate a driver on its own.
    """
    options = Options()
    options.add_argument("--headless")
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-dev-shm-usage")
    options.add_argument("--disable-gpu")
    options.add_argument("--window-size=1920,1080")
    options.add_argument(
        "user-agent=Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
    )
    for p in CHROMEDRIVER_PATHS:
        if os.path.exists(p):
            print(f"Using chromedriver: {p}")
            return webdriver.Chrome(service=Service(p), options=options)
    print("Using system chromedriver")
    return webdriver.Chrome(options=options)


def slugify(name):
    """Return a lowercase slug of *name*: whitespace runs become "_", and
    every character outside [a-z0-9_] is dropped."""
    s = name.lower()
    s = re.sub(r"\s+", "_", s)
    s = re.sub(r"[^a-z0-9_]", "", s)
    return s


def download_image(img_session, plant_name, href, img_dir):
    """Download plant thumbnail from Gardenate, save locally.

    Args:
        img_session: requests session used for the download.
        plant_name: Display name; its slug becomes the file name.
        href: Site-relative plant path (e.g. "/plant/Carrot").
        img_dir: Directory the JPEG is written to.

    Returns:
        "images/<slug>.jpg" when the image exists locally or was downloaded,
        otherwise "" (failures are logged, never raised).
    """
    img_url = f"{BASE_URL}{href}/thumb/100"
    slug = slugify(plant_name)
    img_path = os.path.join(img_dir, f"{slug}.jpg")
    rel_path = f"images/{slug}.jpg"

    # Reuse a previously downloaded file unless it is suspiciously small.
    if os.path.exists(img_path) and os.path.getsize(img_path) > MIN_IMAGE_BYTES:
        return rel_path

    try:
        resp = img_session.get(img_url, timeout=10)
        if resp.status_code == 200 and len(resp.content) > MIN_IMAGE_BYTES:
            with open(img_path, "wb") as f:
                f.write(resp.content)
            print(f" Downloaded image: {slug}.jpg")
            return rel_path
        print(f" [WARN] No image for {plant_name} (HTTP {resp.status_code})")
        return ""
    except (req_lib.RequestException, OSError) as e:
        # Network and filesystem errors are non-fatal: the plant is simply
        # stored without an image.
        print(f" [WARN] Image error for {plant_name}: {e}")
        return ""


def extract_plants_from_html(html, month_num):
    """Parse first plant-list div only from raw HTML.

    Args:
        html: Full page source.
        month_num: Month being processed. Not used by the parser; kept so
            existing callers keep working.

    Returns:
        A list of dicts with keys "name", "aliases", "action", "href" and
        "detail_url", one per distinct plant name, in page order. Empty when
        no plant-list marker is found.
    """
    markers = PLANT_LIST_RE.finditer(html)
    first = next(markers, None)
    if first is None:
        print(" [ERROR] plant-list marker not found")
        return []
    second = next(markers, None)
    if second is not None:
        # Drop everything from the second plant list onwards so only the
        # first list is parsed.
        html = html[:second.start()]

    soup = BeautifulSoup(html, "lxml")
    plants = []
    seen = set()

    for tr in soup.find_all("tr"):
        td = tr.find("td")
        if not td:
            continue
        row_div = td.find("div", class_="row")
        if not row_div:
            continue
        if "feed-ad" in row_div.get("class", []):
            continue  # advertisement row, not a plant

        columns = row_div.find_all("div", recursive=False)
        name_col = next(
            (div for div in columns if "col-md-6" in div.get("class", [])),
            None,
        )
        if not name_col:
            continue

        link = name_col.find("a", href=PLANT_LINK_RE)
        if not link:
            continue
        plant_name = link.get_text(strip=True)
        href = link.get("href", "")
        if not plant_name or plant_name in seen:
            continue

        # Desktop layout: the action text sits in its own column.
        action = ""
        for div in columns:
            cls = div.get("class", [])
            if "d-none" in cls and "d-md-block" in cls and "col-md" in cls:
                action = div.get_text(strip=True)
                break
        # Mobile layout: fall back to the block nested in the name column.
        if not action:
            mob = name_col.find("div", class_="d-md-none")
            if mob:
                action = mob.get_text(strip=True)

        aliases = []
        italic = name_col.find("i")
        if italic:
            alias_text = ALIAS_PREFIX_RE.sub("", italic.get_text(strip=True)).strip()
            if alias_text:
                aliases = [a.strip() for a in alias_text.split(",") if a.strip()]

        seen.add(plant_name)
        plants.append({
            "name": plant_name,
            "aliases": aliases,
            "action": action,
            "href": href,
            "detail_url": f"{BASE_URL}{href}?zone={ZONE_PARAM}",
        })

    return plants


def _js_click(driver, element):
    """Click *element* through JavaScript."""
    driver.execute_script("arguments[0].click();", element)


def _click_month_href(driver, month_num):
    """Strategy 1: click an anchor whose href carries the query m=<month_num>."""
    # The CSS substring match also hits e.g. m=10 for month 1, so each href
    # is re-checked with an anchored pattern.
    pattern = re.compile(rf"[?&]m={month_num}(&|$)")
    for link in driver.find_elements(By.CSS_SELECTOR, f"a[href*='m={month_num}']"):
        href = link.get_attribute("href") or ""
        if pattern.search(href):
            _js_click(driver, link)
            print(f" Clicked href: {href}")
            return True
    return False


def _click_month_text(driver, month_num):
    """Strategy 2: click a link/button inside .month-selector by its text."""
    abbrev = MONTHS[month_num][:3]
    sel = driver.find_element(By.CLASS_NAME, "month-selector")
    btns = sel.find_elements(By.TAG_NAME, "a") + sel.find_elements(By.TAG_NAME, "button")
    print(f" month-selector has {len(btns)} elements")
    for btn in btns:
        txt = btn.text.strip()
        if txt == str(month_num) or txt.startswith(abbrev):
            _js_click(driver, btn)
            print(f" Clicked text: '{txt}'")
            return True
    return False


def _click_month_attribute(driver, month_num):
    """Strategy 3: click the first element with a matching data-month or value."""
    btn = driver.find_element(
        By.CSS_SELECTOR, f"[data-month='{month_num}'], [value='{month_num}']"
    )
    _js_click(driver, btn)
    print(f" Clicked data-month={month_num}")
    return True


def _select_month_dropdown(driver, month_num):
    """Strategy 4: pick the month from a <select> dropdown by its label.

    NOTE(review): the body of this strategy was unreadable in the reviewed
    copy of this file and has been reconstructed; confirm the selector logic
    against the live page before relying on it.
    """
    abbrev = MONTHS[month_num][:3]
    for dropdown in driver.find_elements(By.TAG_NAME, "select"):
        picker = Select(dropdown)
        for option in picker.options:
            label = option.text.strip()
            if label.startswith(abbrev):
                picker.select_by_visible_text(label)
                print(f" Selected dropdown option: '{label}'")
                return True
    return False


# Month-selection strategies, tried in order until one succeeds.
_MONTH_STRATEGIES = (
    _click_month_href,
    _click_month_text,
    _click_month_attribute,
    _select_month_dropdown,
)


def parse_month_page(driver, month_num):
    """Switch the loaded page to *month_num* and return its plant list.

    Tries each month-selection strategy in turn. When none succeeds, the
    month-selector HTML is dumped for debugging and an empty list is
    returned.

    Args:
        driver: WebDriver with the Gardenate zone page already loaded.
        month_num: Month number, 1-12.

    Returns:
        The list produced by extract_plants_from_html for that month.
    """
    print(f" Processing month {month_num} ({MONTHS[month_num]})")

    clicked = False
    for number, strategy in enumerate(_MONTH_STRATEGIES, 1):
        try:
            clicked = strategy(driver, month_num)
        except Exception as e:
            # Deliberately broad: each strategy is best-effort and a failure
            # only means the next one is tried.
            print(f" Strategy {number} failed: {e}")
        if clicked:
            break

    if not clicked:
        print(f" [ERROR] Could not click month {month_num}")
        try:
            sel = driver.find_element(By.CLASS_NAME, "month-selector")
            print(f" Selector HTML:\n{sel.get_attribute('outerHTML')[:1000]}")
        except Exception:
            print(" month-selector element not found")
        return []

    time.sleep(3)  # give the page time to refresh after the click
    html = driver.page_source
    plants = extract_plants_from_html(html, month_num)
    print(f" Found {len(plants)} plants in {MONTHS[month_num]}")
    return plants


def _extract_after(text, label, not_after=""):
    """Return up to 120 characters following *label* in *text*, or "".

    Matching is case-insensitive. When *not_after* is given, occurrences of
    *label* directly preceded by it are skipped (e.g. "Spacing" must not
    match inside "Row spacing").
    """
    guard = f"(?<!{re.escape(not_after)})" if not_after else ""
    m = re.search(
        guard + re.escape(label) + r"[:\s]*([^\n]{1,120})", text, re.IGNORECASE
    )
    return m.group(1).strip() if m else ""


def parse_plant_detail(driver, plant_name, detail_url):
    """Load a plant detail page and pull out its descriptive fields.

    Args:
        driver: WebDriver used to load the page.
        plant_name: Name used for log output only.
        detail_url: Absolute URL of the detail page.

    Returns:
        A dict with keys "description", "spacing", "height", "row_spacing",
        "harvest_time", "pot_friendly", "notes", "compatible" and "avoid".
        Fields that cannot be found keep their empty default.
    """
    print(f" Detail: {plant_name}")
    driver.get(detail_url)
    try:
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, "body"))
        )
    except Exception as e:
        # Best-effort wait: parse whatever did load rather than abort.
        print(f" [WARN] Page load wait failed for {plant_name}: {e}")
    time.sleep(2)

    soup = BeautifulSoup(driver.page_source, "lxml")
    info = {
        "description": "",
        "spacing": "",
        "height": "",
        "row_spacing": "",
        "harvest_time": "",
        "pot_friendly": False,
        "notes": "",
        "compatible": [],
        "avoid": [],
    }

    desc_el = (
        soup.find("div", class_="description")
        or soup.find("div", class_="plant-description")
        or soup.find("p", class_="description")
    )
    if desc_el:
        info["description"] = desc_el.get_text(" ", strip=True)
    else:
        # No dedicated description element: take the first paragraph that is
        # long enough to look like real prose.
        container = soup.find("main") or soup.find("div", id="content") or soup
        for p in container.find_all("p"):
            text = p.get_text(strip=True)
            if len(text) > 60:
                info["description"] = text
                break

    full = soup.get_text(" ", strip=True)

    info["spacing"] = _extract_after(full, "Spacing", not_after="Row ")
    info["row_spacing"] = _extract_after(full, "Row spacing")
    info["height"] = _extract_after(full, "Height")
    info["harvest_time"] = _extract_after(full, "Harvest")

    pot = re.search(r"can grow in (pots|containers)[:\s]*([^\n]+)", full, re.IGNORECASE)
    if pot:
        info["pot_friendly"] = pot.group(2).strip().lower().startswith(("yes", "true"))

    notes_el = soup.find("div", class_="notes") or soup.find("div", class_="tips")
    if notes_el:
        info["notes"] = notes_el.get_text(" ", strip=True)

    cm = re.search(r"(?:grows well with|compatible)[:\s]+([^\n.]+)", full, re.IGNORECASE)
    if cm:
        info["compatible"] = [x.strip() for x in cm.group(1).split(",") if x.strip()]

    av = re.search(r"(?:avoid growing|keep away)[:\s]+([^\n.]+)", full, re.IGNORECASE)
    if av:
        info["avoid"] = [x.strip() for x in av.group(1).split(",") if x.strip()]

    return info


def _load_existing_data():
    """Load previously saved results, or return a fresh structure.

    A missing file, invalid JSON, or a top-level value that is not an object
    all yield a fresh structure; an object without a "plants" mapping gets an
    empty one.
    """
    fresh = {"zone": "USA - Zone 8b", "plants": {}}
    if not os.path.exists(OUTPUT_FILE):
        return fresh
    try:
        with open(OUTPUT_FILE, "r", encoding="utf-8") as f:
            data = json.load(f)
    except json.JSONDecodeError:
        return fresh
    if not isinstance(data, dict):
        return fresh
    if not isinstance(data.get("plants"), dict):
        data["plants"] = {}
    print(f"Resuming — {len(data['plants'])} plants already saved.")
    return data


def _save_data(all_data):
    """Write the full result structure to OUTPUT_FILE as indented JSON."""
    with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
        json.dump(all_data, f, indent=2)


def _scan_months(driver):
    """Step 1: return {month_num: plant list} for all twelve months."""
    month_plant_map = {}
    for month_num in range(1, 13):
        month_plant_map[month_num] = parse_month_page(driver, month_num)
        time.sleep(1)
    return month_plant_map


def _build_database(month_plant_map, plants_db, img_session):
    """Step 2: merge monthly listings into *plants_db*, downloading images
    for plants seen for the first time."""
    for month_num, month_plants in month_plant_map.items():
        for p in month_plants:
            slug = slugify(p["name"])
            if slug not in plants_db:
                href = p.get("href", f"/plant/{slug}")
                img_path = download_image(img_session, p["name"], href, IMG_DIR)
                plants_db[slug] = {
                    "name": p["name"],
                    "aliases": p["aliases"],
                    "image": img_path,
                    "description": "",
                    "spacing": "",
                    "height": "",
                    "row_spacing": "",
                    "harvest_time": "",
                    "pot_friendly": False,
                    "notes": "",
                    "compatible": [],
                    "avoid": [],
                    "months": {},
                    "detail_url": p["detail_url"],
                }
                time.sleep(0.5)
            entry = plants_db[slug]
            # Entries restored from an older file may lack these keys.
            entry.setdefault("months", {})[str(month_num)] = p["action"] if p["action"] else "Plant"
            if p["aliases"] and not entry.get("aliases"):
                entry["aliases"] = p["aliases"]


def _fetch_details(driver, all_data):
    """Step 3: fetch detail pages for plants without a cached description,
    saving after every plant so an interrupted run can resume."""
    plants_db = all_data["plants"]
    total = len(plants_db)
    for i, (slug, plant) in enumerate(plants_db.items(), 1):
        if plant.get("description") and len(plant["description"]) > 20:
            print(f" [{i}/{total}] Skipping {plant['name']} (cached)")
            continue
        detail_url = plant.get("detail_url", f"{BASE_URL}/plant/{slug}?zone={ZONE_PARAM}")
        details = parse_plant_detail(driver, plant["name"], detail_url)
        plant.update(details)
        _save_data(all_data)
        time.sleep(DELAY)


def main():
    """Run the scrape: scan the months, build the plant database (with
    images), then fetch per-plant details. Progress is persisted to
    OUTPUT_FILE so a later run can resume."""
    print("=" * 60)
    print("Gardenate Zone 8b Scraper (Selenium - Click Mode)")
    print("=" * 60)

    os.makedirs(os.path.dirname(OUTPUT_FILE), exist_ok=True)
    os.makedirs(IMG_DIR, exist_ok=True)

    all_data = _load_existing_data()
    plants_db = all_data["plants"]

    with req_lib.Session() as img_session:
        img_session.headers.update(
            {"User-Agent": "Mozilla/5.0 (compatible; FarmGardenBot/1.0)"}
        )

        print("\nStarting Chrome (headless)...")
        driver = make_driver()
        try:
            print("\nLoading base page...")
            driver.get(f"{BASE_URL}/?zone={ZONE_PARAM}")
            time.sleep(4)

            # Debug aid: show the month selector markup the strategies will
            # be working against.
            try:
                sel = driver.find_element(By.CLASS_NAME, "month-selector")
                print(f"\nMonth selector HTML:\n{sel.get_attribute('outerHTML')[:2000]}\n")
            except Exception as e:
                print(f"Could not find month-selector: {e}")

            # Step 1: Scan all 12 months
            print("\n[Step 1] Scanning all 12 months...")
            month_plant_map = _scan_months(driver)

            # Step 2: Build plant database + download images
            print("\n[Step 2] Building plant database + downloading images...")
            _build_database(month_plant_map, plants_db, img_session)
            _save_data(all_data)
            print(f" Saved {len(plants_db)} plants to JSON")

            # Step 3: Fetch plant details
            print(f"\n[Step 3] Fetching details for {len(plants_db)} plants...")
            _fetch_details(driver, all_data)
        finally:
            # Persist whatever was collected even when a step failed, and
            # always shut the browser down.
            try:
                _save_data(all_data)
            finally:
                driver.quit()

    print(f"\n[Done] Saved {len(plants_db)} plants to {OUTPUT_FILE}")
    print(f"Images saved to: {IMG_DIR}")


if __name__ == "__main__":
    main()