Files
FARM/scraper.py
T
2026-06-25 21:29:21 +00:00

409 lines
14 KiB
Python

#!/usr/bin/env python3
"""
Gardenate Zone 8b Scraper - Selenium Click Mode
Requirements: pip install selenium beautifulsoup4 lxml requests
apt install chromium-browser chromium-chromedriver
Usage: source venv/bin/activate && python3 scraper.py
"""
import json
import time
import os
import re
import requests as req_lib
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
# Site root and the zone value as it appears in the query string
# (the "+" characters stand in for the spaces of "USA - Zone 8b").
BASE_URL = "https://www.gardenate.com"
ZONE_PARAM = "USA+-+Zone+8b"

# Output locations, all resolved relative to the directory of this script.
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
OUTPUT_FILE = os.path.join(BASE_DIR, "data", "plants.json")
IMG_DIR = os.path.join(BASE_DIR, "images")

# Pause, in seconds, after each plant detail page is fetched.
DELAY = 2.0

# Month number (1-12) -> English month name.
MONTHS = dict(enumerate(
    (
        "January", "February", "March", "April",
        "May", "June", "July", "August",
        "September", "October", "November", "December",
    ),
    start=1,
))
def make_driver():
    """Create a headless Chrome WebDriver.

    The first chromedriver binary found at one of the known install
    locations is passed to Selenium explicitly. When none of those paths
    exists, Chrome is constructed without an explicit driver path.

    Returns:
        A ``webdriver.Chrome`` instance configured with the options below.
    """
    chrome_args = (
        "--headless",
        "--no-sandbox",
        "--disable-dev-shm-usage",
        "--disable-gpu",
        "--window-size=1920,1080",
        "user-agent=Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
    )
    opts = Options()
    for arg in chrome_args:
        opts.add_argument(arg)

    # Checked in order; the first existing path wins.
    known_driver_paths = (
        "/usr/bin/chromedriver",
        "/usr/lib/chromium-browser/chromedriver",
        "/usr/lib/chromium/chromedriver",
        "/snap/bin/chromedriver",
    )
    driver_path = next(
        (path for path in known_driver_paths if os.path.exists(path)), None
    )
    if driver_path is None:
        print("Using system chromedriver")
        return webdriver.Chrome(options=opts)

    print(f"Using chromedriver: {driver_path}")
    return webdriver.Chrome(service=Service(driver_path), options=opts)
def slugify(name):
    """Turn a plant name into a lowercase identifier.

    The name is lowercased, every run of whitespace becomes a single
    underscore, and any character other than ``a-z``, ``0-9`` or ``_`` is
    dropped.
    """
    allowed = "abcdefghijklmnopqrstuvwxyz0123456789_"
    underscored = "_".join(re.split(r"\s+", name.lower()))
    return "".join(ch for ch in underscored if ch in allowed)
def download_image(img_session, plant_name, href, img_dir):
    """Fetch a plant's ``/thumb/100`` thumbnail and store it in ``img_dir``.

    Args:
        img_session: Session-like object whose ``get(url, timeout=...)``
            returns a response exposing ``status_code`` and ``content``.
        plant_name: Display name; its slug becomes the file name.
        href: Site-relative plant path appended to ``BASE_URL``.
        img_dir: Directory the ``<slug>.jpg`` file is written to.

    Returns:
        ``"images/<slug>.jpg"`` when the file already exists with more than
        500 bytes or was downloaded successfully; ``""`` on any failure.
    """
    slug = slugify(plant_name)
    relative_path = f"images/{slug}.jpg"
    local_file = os.path.join(img_dir, f"{slug}.jpg")

    # A previously saved file above the size threshold is reused as-is.
    already_saved = os.path.exists(local_file) and os.path.getsize(local_file) > 500
    if already_saved:
        return relative_path

    thumb_url = f"{BASE_URL}{href}/thumb/100"
    try:
        response = img_session.get(thumb_url, timeout=10)
        usable = response.status_code == 200 and len(response.content) > 500
        if not usable:
            print(f" [WARN] No image for {plant_name} (HTTP {response.status_code})")
            return ""
        with open(local_file, "wb") as out:
            out.write(response.content)
        print(f" Downloaded image: {slug}.jpg")
        return relative_path
    except Exception as e:
        # Images are optional: report the problem and carry on without one.
        print(f" [WARN] Image error for {plant_name}: {e}")
        return ""
def extract_plants_from_html(html, month_num):
    """Collect the plants listed in the first ``plant-list`` div of a page.

    Args:
        html: Raw page source.
        month_num: Not used by the parsing; kept for the caller's signature.

    Returns:
        A list of dicts with keys ``name``, ``aliases``, ``action``,
        ``href`` and ``detail_url``, one per distinct plant name, in page
        order. An empty list when the plant-list marker is absent.
    """
    marker = '<div class="plant-list">'
    start = html.find(marker)
    if start < 0:
        print(" [ERROR] plant-list marker not found")
        return []

    # Drop everything from a second plant-list div onwards so only rows of
    # the first list are parsed.
    following = html.find(marker, start + len(marker))
    if following >= 0:
        html = html[:following]

    plant_href = re.compile(r"^/plant/[^/]+$")
    desktop_action_classes = {"d-none", "d-md-block", "col-md"}

    soup = BeautifulSoup(html, "lxml")
    results = []
    known_names = set()
    for table_row in soup.find_all("tr"):
        cell = table_row.find("td")
        layout = cell.find("div", class_="row") if cell else None
        # Skip rows without the expected layout div, and advertisement rows.
        if not layout or "feed-ad" in layout.get("class", []):
            continue

        direct_divs = layout.find_all("div", recursive=False)
        name_div = next(
            (d for d in direct_divs if "col-md-6" in d.get("class", [])), None
        )
        if name_div is None:
            continue

        anchor = name_div.find("a", href=plant_href)
        if not anchor:
            continue

        title = anchor.get_text(strip=True)
        href = anchor.get("href", "")
        if not title or title in known_names:
            continue

        # Prefer the desktop-only action column; fall back to the mobile-only
        # div inside the name column when the former is missing or empty.
        action = next(
            (
                d.get_text(strip=True)
                for d in direct_divs
                if desktop_action_classes <= set(d.get("class", []))
            ),
            "",
        )
        if not action:
            mobile = name_div.find("div", class_="d-md-none")
            if mobile:
                action = mobile.get_text(strip=True)

        # Aliases come from the italic text, minus a leading "also".
        aliases = []
        alias_el = name_div.find("i")
        if alias_el:
            raw = alias_el.get_text(strip=True)
            raw = re.sub(r"^also\s*,?\s*", "", raw, flags=re.IGNORECASE).strip()
            aliases = [part.strip() for part in raw.split(",") if part.strip()]

        known_names.add(title)
        results.append({
            "name": title,
            "aliases": aliases,
            "action": action,
            "href": href,
            "detail_url": f"{BASE_URL}{href}?zone={ZONE_PARAM}",
        })
    return results
def _click_month_by_href(driver, month_num):
    """Click the first anchor whose href carries an exact ``m=<month_num>`` parameter."""
    for anchor in driver.find_elements(By.CSS_SELECTOR, f"a[href*='m={month_num}']"):
        href = anchor.get_attribute("href") or ""
        # The CSS substring match also hits m=10..12 for month 1, so the
        # regex demands that the value ends right after the number.
        if re.search(rf"[?&]m={month_num}(&|$)", href):
            driver.execute_script("arguments[0].click();", anchor)
            print(f" Clicked href: {href}")
            return True
    return False


def _click_month_by_text(driver, month_num):
    """Click a link/button inside ``.month-selector`` labelled with the month."""
    # First three letters of the month name ("Jan", "Feb", ...).
    abbrev = MONTHS[month_num][:3]
    selector = driver.find_element(By.CLASS_NAME, "month-selector")
    candidates = (selector.find_elements(By.TAG_NAME, "a")
                  + selector.find_elements(By.TAG_NAME, "button"))
    print(f" month-selector has {len(candidates)} elements")
    for element in candidates:
        txt = element.text.strip()
        if txt == str(month_num) or txt.startswith(abbrev):
            driver.execute_script("arguments[0].click();", element)
            print(f" Clicked text: '{txt}'")
            return True
    return False


def _click_month_by_attribute(driver, month_num):
    """Click the first element with a matching ``data-month`` or ``value`` attribute."""
    element = driver.find_element(
        By.CSS_SELECTOR, f"[data-month='{month_num}'], [value='{month_num}']")
    driver.execute_script("arguments[0].click();", element)
    print(f" Clicked data-month={month_num}")
    return True


def _select_month_in_dropdown(driver, month_num):
    """Choose the month in a ``<select>`` element, if the page has one."""
    from selenium.webdriver.support.ui import Select
    dropdown = driver.find_element(
        By.CSS_SELECTOR, "select[name='m'], select.month, #month-select")
    Select(dropdown).select_by_value(str(month_num))
    print(" Selected via <select>")
    return True


def parse_month_page(driver, month_num):
    """Switch the loaded page to ``month_num`` and return that month's plants.

    Four ways of activating the month are tried in order (link href, selector
    text, data attribute, ``<select>``); the first that succeeds wins. A
    strategy that raises is reported and the next one is tried.

    Args:
        driver: Selenium WebDriver with the zone page already loaded.
        month_num: Month number, a key of ``MONTHS``.

    Returns:
        The list produced by ``extract_plants_from_html`` for the page after
        the switch, or ``[]`` when no strategy could activate the month.
    """
    print(f" Processing month {month_num} ({MONTHS[month_num]})")

    strategies = (
        ("Strategy 1", _click_month_by_href),
        ("Strategy 2", _click_month_by_text),
        ("Strategy 3", _click_month_by_attribute),
        ("Strategy 4", _select_month_in_dropdown),
    )
    clicked = False
    for label, strategy in strategies:
        try:
            clicked = strategy(driver, month_num)
        except Exception as e:
            print(f" {label} failed: {e}")
        if clicked:
            break

    if not clicked:
        print(f" [ERROR] Could not click month {month_num}")
        # Dump the selector markup to help diagnose a changed page layout.
        try:
            sel = driver.find_element(By.CLASS_NAME, "month-selector")
            print(f" Selector HTML:\n{sel.get_attribute('outerHTML')[:1000]}")
        except Exception:
            # Not a bare except: Ctrl-C / SystemExit must still propagate.
            print(" month-selector element not found")
        return []

    # Fixed pause to let the page update after the click.
    time.sleep(3)
    plants = extract_plants_from_html(driver.page_source, month_num)
    print(f" Found {len(plants)} plants in {MONTHS[month_num]}")
    return plants
def parse_plant_detail(driver, plant_name, detail_url):
    """Load a plant's detail page and pull out its descriptive fields.

    All extraction is heuristic: a field that cannot be found keeps its
    default value.

    Args:
        driver: Selenium WebDriver used to load the page.
        plant_name: Display name, used only in progress messages.
        detail_url: Absolute URL of the plant detail page.

    Returns:
        A dict with keys ``description``, ``spacing``, ``height``,
        ``row_spacing``, ``harvest_time``, ``notes`` (strings),
        ``pot_friendly`` (bool), ``compatible`` and ``avoid`` (lists of
        strings).
    """
    print(f" Detail: {plant_name}")
    driver.get(detail_url)
    try:
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, "body")))
    except Exception as e:
        # Best effort: whatever has loaded is still parsed, but the failed
        # wait is reported instead of being swallowed silently.
        print(f" [WARN] Page wait failed for {plant_name}: {e}")
    time.sleep(2)

    soup = BeautifulSoup(driver.page_source, "lxml")
    info = {
        "description": "",
        "spacing": "",
        "height": "",
        "row_spacing": "",
        "harvest_time": "",
        "pot_friendly": False,
        "notes": "",
        "compatible": [],
        "avoid": [],
    }

    desc_el = (soup.find("div", class_="description") or
               soup.find("div", class_="plant-description") or
               soup.find("p", class_="description"))
    if desc_el:
        info["description"] = desc_el.get_text(" ", strip=True)
    else:
        # No dedicated description element: use the first paragraph longer
        # than 60 characters in the main content area.
        container = soup.find("main") or soup.find("div", id="content") or soup
        for p in container.find_all("p"):
            text = p.get_text(strip=True)
            if len(text) > 60:
                info["description"] = text
                break

    full = soup.get_text(" ", strip=True)

    def extract_after(text, label, not_after=""):
        """Return up to 120 characters following ``label`` (case-insensitive).

        ``not_after`` is an optional fixed-width regex; occurrences of
        ``label`` immediately preceded by it are ignored.
        """
        guard = f"(?<!{not_after})" if not_after else ""
        m = re.search(guard + re.escape(label) + r"[:\s]*([^\n]{1,120})",
                      text, re.IGNORECASE)
        return m.group(1).strip() if m else ""

    # Without the guard the case-insensitive "Spacing" search would match the
    # tail of "Row spacing" and report the row-spacing value as plant spacing.
    info["spacing"] = extract_after(full, "Spacing", not_after=r"row\s")
    info["row_spacing"] = extract_after(full, "Row spacing")
    info["height"] = extract_after(full, "Height")
    info["harvest_time"] = extract_after(full, "Harvest")

    pot = re.search(r"can grow in (pots|containers)[:\s]*([^\n]+)", full, re.IGNORECASE)
    if pot:
        info["pot_friendly"] = pot.group(2).strip().lower().startswith(("yes", "true"))

    notes_el = soup.find("div", class_="notes") or soup.find("div", class_="tips")
    if notes_el:
        info["notes"] = notes_el.get_text(" ", strip=True)

    cm = re.search(r"(?:grows well with|compatible)[:\s]+([^\n.]+)", full, re.IGNORECASE)
    if cm:
        info["compatible"] = [x.strip() for x in cm.group(1).split(",") if x.strip()]

    av = re.search(r"(?:avoid growing|keep away)[:\s]+([^\n.]+)", full, re.IGNORECASE)
    if av:
        info["avoid"] = [x.strip() for x in av.group(1).split(",") if x.strip()]

    return info
def _load_database():
    """Return the saved plant database, or a fresh one if none is usable."""
    fresh = {"zone": "USA - Zone 8b", "plants": {}}
    if not os.path.exists(OUTPUT_FILE):
        return fresh
    try:
        with open(OUTPUT_FILE, "r", encoding="utf-8") as f:
            loaded = json.load(f)
    except (json.JSONDecodeError, UnicodeDecodeError):
        return fresh
    # A file that parses but does not have the expected shape would otherwise
    # crash later on the "plants" lookup.
    if not isinstance(loaded, dict) or not isinstance(loaded.get("plants", {}), dict):
        print("[WARN] Existing data file has an unexpected structure; starting fresh.")
        return fresh
    loaded.setdefault("plants", {})
    print(f"Resuming — {len(loaded['plants'])} plants already saved.")
    return loaded


def _save_database(all_data):
    """Write the database to OUTPUT_FILE via a temp file and ``os.replace``.

    An interrupted write then leaves the previous file intact instead of a
    truncated one that the next run would discard.
    """
    tmp_path = f"{OUTPUT_FILE}.tmp"
    with open(tmp_path, "w", encoding="utf-8") as f:
        json.dump(all_data, f, indent=2)
    os.replace(tmp_path, OUTPUT_FILE)


def main():
    """Run the full scrape: month lists, thumbnails, then plant details.

    Progress is saved to ``OUTPUT_FILE`` after the database is built and
    after every detail page, so an interrupted run can be resumed; plants
    whose description is already stored are not fetched again.
    """
    print("=" * 60)
    print("Gardenate Zone 8b Scraper (Selenium - Click Mode)")
    print("=" * 60)

    os.makedirs(os.path.dirname(OUTPUT_FILE), exist_ok=True)
    os.makedirs(IMG_DIR, exist_ok=True)

    all_data = _load_database()
    plants_db = all_data["plants"]

    img_session = req_lib.Session()
    img_session.headers.update({"User-Agent": "Mozilla/5.0 (compatible; FarmGardenBot/1.0)"})

    print("\nStarting Chrome (headless)...")
    driver = make_driver()
    try:
        print(f"\nLoading base page...")
        driver.get(f"{BASE_URL}/?zone={ZONE_PARAM}")
        time.sleep(4)

        # Diagnostic only: show the month selector markup if it is present.
        try:
            sel = driver.find_element(By.CLASS_NAME, "month-selector")
            print(f"\nMonth selector HTML:\n{sel.get_attribute('outerHTML')[:2000]}\n")
        except Exception as e:
            print(f"Could not find month-selector: {e}")

        # Step 1: Scan all 12 months
        print("\n[Step 1] Scanning all 12 months...")
        month_plant_map = {}
        for month_num in range(1, 13):
            month_plant_map[month_num] = parse_month_page(driver, month_num)
            time.sleep(1)

        # Step 2: Build plant database + download images
        print("\n[Step 2] Building plant database + downloading images...")
        for month_num, month_plants in month_plant_map.items():
            for p in month_plants:
                slug = slugify(p["name"])
                if slug not in plants_db:
                    href = p.get("href", f"/plant/{slug}")
                    img_path = download_image(img_session, p["name"], href, IMG_DIR)
                    plants_db[slug] = {
                        "name": p["name"],
                        "aliases": p["aliases"],
                        "image": img_path,
                        "description": "",
                        "spacing": "",
                        "height": "",
                        "row_spacing": "",
                        "harvest_time": "",
                        "pot_friendly": False,
                        "notes": "",
                        "compatible": [],
                        "avoid": [],
                        "months": {},
                        "detail_url": p["detail_url"],
                    }
                    # Short pause after each new plant's image request.
                    time.sleep(0.5)
                entry = plants_db[slug]
                entry.setdefault("months", {})[str(month_num)] = (
                    p["action"] if p["action"] else "Plant")
                if p["aliases"] and not entry.get("aliases"):
                    entry["aliases"] = p["aliases"]
        _save_database(all_data)
        print(f" Saved {len(plants_db)} plants to JSON")

        # Step 3: Fetch plant details
        print(f"\n[Step 3] Fetching details for {len(plants_db)} plants...")
        for i, (slug, plant) in enumerate(plants_db.items(), 1):
            if plant.get("description") and len(plant["description"]) > 20:
                print(f" [{i}/{len(plants_db)}] Skipping {plant['name']} (cached)")
                continue
            detail_url = plant.get("detail_url", f"{BASE_URL}/plant/{slug}?zone={ZONE_PARAM}")
            details = parse_plant_detail(driver, plant["name"], detail_url)
            plants_db[slug].update(details)
            # Save after every plant so an interruption loses at most one.
            _save_database(all_data)
            time.sleep(DELAY)
    finally:
        # Save even when shutting the browser down fails.
        try:
            driver.quit()
        finally:
            _save_database(all_data)

    print(f"\n[Done] Saved {len(plants_db)} plants to {OUTPUT_FILE}")
    print(f"Images saved to: {IMG_DIR}")


if __name__ == "__main__":
    main()