409 lines
14 KiB
Python
409 lines
14 KiB
Python
#!/usr/bin/env python3
|
|
"""
Gardenate Zone 8b Scraper - Selenium Click Mode

Requirements: pip install selenium beautifulsoup4 lxml requests
              apt install chromium-browser chromium-chromedriver
Usage: source venv/bin/activate && python3 scraper.py
"""
|
|
|
|
import json
|
|
import time
|
|
import os
|
|
import re
|
|
import requests as req_lib
|
|
from bs4 import BeautifulSoup
|
|
from selenium import webdriver
|
|
from selenium.webdriver.chrome.options import Options
|
|
from selenium.webdriver.chrome.service import Service
|
|
from selenium.webdriver.common.by import By
|
|
from selenium.webdriver.support.ui import WebDriverWait
|
|
from selenium.webdriver.support import expected_conditions as EC
|
|
|
|
# Site root and the zone query value appended to Gardenate URLs.
BASE_URL = "https://www.gardenate.com"
ZONE_PARAM = "USA+-+Zone+8b"

# Output locations are resolved relative to the directory holding this script.
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
OUTPUT_FILE = os.path.join(BASE_DIR, "data", "plants.json")
IMG_DIR = os.path.join(BASE_DIR, "images")

# Pause, in seconds, applied after each plant detail fetch.
DELAY = 2.0

# Month number (1-12) -> English month name.
MONTHS = dict(enumerate(
    (
        "January", "February", "March", "April",
        "May", "June", "July", "August",
        "September", "October", "November", "December",
    ),
    start=1,
))
|
|
|
|
|
|
def make_driver():
    """Create a headless Chrome WebDriver.

    A fixed list of chromedriver locations is probed in order and the first
    existing path is passed to Selenium as an explicit ``Service``. When none
    of them exists, Chrome is constructed without an explicit driver path.

    Returns:
        The ``webdriver.Chrome`` instance.
    """
    chrome_opts = Options()
    for flag in (
        "--headless",
        "--no-sandbox",
        "--disable-dev-shm-usage",
        "--disable-gpu",
        "--window-size=1920,1080",
        "user-agent=Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
    ):
        chrome_opts.add_argument(flag)

    known_locations = (
        "/usr/bin/chromedriver",
        "/usr/lib/chromium-browser/chromedriver",
        "/usr/lib/chromium/chromedriver",
        "/snap/bin/chromedriver",
    )
    driver_path = next(
        (candidate for candidate in known_locations if os.path.exists(candidate)),
        None,
    )
    if driver_path is not None:
        print(f"Using chromedriver: {driver_path}")
        return webdriver.Chrome(service=Service(driver_path), options=chrome_opts)

    print("Using system chromedriver")
    return webdriver.Chrome(options=chrome_opts)
|
|
|
|
|
|
def slugify(name):
    """Turn a plant name into a lowercase identifier.

    Each run of whitespace becomes a single underscore; afterwards every
    character other than ``a-z``, ``0-9`` and ``_`` is removed.
    """
    underscored = re.sub(r"\s+", "_", name.lower())
    return re.sub(r"[^a-z0-9_]", "", underscored)
|
|
|
|
|
|
def download_image(img_session, plant_name, href, img_dir):
    """Fetch a plant's thumbnail and store it as ``<slug>.jpg`` in ``img_dir``.

    Args:
        img_session: Object exposing ``get(url, timeout=...)`` whose response
            has ``status_code`` and ``content``.
        plant_name: Display name; its slug is used as the file name.
        href: Site-relative plant path, appended to ``BASE_URL``.
        img_dir: Directory the image file is written into.

    Returns:
        ``"images/<slug>.jpg"`` when a file larger than 500 bytes already
        exists or was just downloaded, otherwise an empty string.
    """
    img_url = f"{BASE_URL}{href}/thumb/100"
    slug = slugify(plant_name)
    relative_path = f"images/{slug}.jpg"
    target = os.path.join(img_dir, f"{slug}.jpg")

    # A file over 500 bytes on disk counts as already downloaded.
    if os.path.exists(target) and os.path.getsize(target) > 500:
        return relative_path

    try:
        resp = img_session.get(img_url, timeout=10)
        # Non-200 answers and bodies of 500 bytes or fewer are rejected.
        if resp.status_code != 200 or len(resp.content) <= 500:
            print(f" [WARN] No image for {plant_name} (HTTP {resp.status_code})")
            return ""
        with open(target, "wb") as out:
            out.write(resp.content)
        print(f" Downloaded image: {slug}.jpg")
        return relative_path
    except Exception as err:
        print(f" [WARN] Image error for {plant_name}: {err}")
        return ""
|
|
|
|
|
|
def extract_plants_from_html(html, month_num):
    """Collect plant entries from the first ``plant-list`` div of a page.

    The HTML is cut off where a second ``<div class="plant-list">`` marker
    starts, so only rows up to that point are parsed.

    Args:
        html: Raw page source.
        month_num: Accepted for the caller's convenience; not used here.

    Returns:
        A list of dicts with keys ``name``, ``aliases``, ``action``, ``href``
        and ``detail_url``, one per distinct plant name, in page order.
        Empty when the marker is absent.
    """
    marker = '<div class="plant-list">'
    start = html.find(marker)
    if start == -1:
        print(" [ERROR] plant-list marker not found")
        return []
    next_list = html.find(marker, start + len(marker))
    if next_list != -1:
        html = html[:next_list]

    soup = BeautifulSoup(html, "lxml")
    plant_href = re.compile(r"^/plant/[^/]+$")
    desktop_action_classes = ("d-none", "d-md-block", "col-md")
    results = []
    names_seen = set()

    for row in soup.find_all("tr"):
        cell = row.find("td")
        if not cell:
            continue
        grid = cell.find("div", class_="row")
        if not grid:
            continue
        # Advertisement rows share the layout but carry the feed-ad class.
        if "feed-ad" in grid.get("class", []):
            continue

        columns = grid.find_all("div", recursive=False)
        name_col = next(
            (col for col in columns if "col-md-6" in col.get("class", [])),
            None,
        )
        if not name_col:
            continue

        anchor = name_col.find("a", href=plant_href)
        if not anchor:
            continue

        plant_name = anchor.get_text(strip=True)
        href = anchor.get("href", "")
        if not plant_name or plant_name in names_seen:
            continue

        # Prefer the desktop action column; fall back to the mobile-only div.
        action_col = next(
            (
                col for col in columns
                if all(token in col.get("class", []) for token in desktop_action_classes)
            ),
            None,
        )
        action = action_col.get_text(strip=True) if action_col is not None else ""
        if not action:
            mobile = name_col.find("div", class_="d-md-none")
            if mobile:
                action = mobile.get_text(strip=True)

        aliases = []
        italic = name_col.find("i")
        if italic:
            raw = italic.get_text(strip=True)
            # Drop a leading "also" (optionally followed by a comma).
            raw = re.sub(r"^also\s*,?\s*", "", raw, flags=re.IGNORECASE).strip()
            aliases = [part.strip() for part in raw.split(",") if part.strip()]

        names_seen.add(plant_name)
        results.append({
            "name": plant_name,
            "aliases": aliases,
            "action": action,
            "href": href,
            "detail_url": f"{BASE_URL}{href}?zone={ZONE_PARAM}",
        })

    return results
|
|
|
|
|
|
def _click_month_by_href(driver, month_num):
    """Click the first anchor whose href carries exactly ``m=<month_num>``."""
    anchors = driver.find_elements(By.CSS_SELECTOR, f"a[href*='m={month_num}']")
    for anchor in anchors:
        href = anchor.get_attribute("href") or ""
        # The CSS substring match for m=1 also hits m=10..12, so re-check.
        if re.search(rf"[?&]m={month_num}(&|$)", href):
            driver.execute_script("arguments[0].click();", anchor)
            print(f" Clicked href: {href}")
            return True
    return False


def _click_month_by_text(driver, month_num):
    """Click a link/button in the month-selector labelled with the month."""
    abbrevs = {1: "Jan", 2: "Feb", 3: "Mar", 4: "Apr", 5: "May", 6: "Jun",
               7: "Jul", 8: "Aug", 9: "Sep", 10: "Oct", 11: "Nov", 12: "Dec"}
    selector = driver.find_element(By.CLASS_NAME, "month-selector")
    candidates = (selector.find_elements(By.TAG_NAME, "a")
                  + selector.find_elements(By.TAG_NAME, "button"))
    print(f" month-selector has {len(candidates)} elements")
    for element in candidates:
        label = element.text.strip()
        if label == str(month_num) or label.startswith(abbrevs[month_num]):
            driver.execute_script("arguments[0].click();", element)
            print(f" Clicked text: '{label}'")
            return True
    return False


def _click_month_by_attribute(driver, month_num):
    """Click the first element with a matching data-month or value attribute."""
    element = driver.find_element(
        By.CSS_SELECTOR,
        f"[data-month='{month_num}'], [value='{month_num}']")
    driver.execute_script("arguments[0].click();", element)
    print(f" Clicked data-month={month_num}")
    return True


def _select_month_in_dropdown(driver, month_num):
    """Pick the month by value in a ``<select>`` element."""
    from selenium.webdriver.support.ui import Select
    dropdown = driver.find_element(
        By.CSS_SELECTOR,
        "select[name='m'], select.month, #month-select")
    Select(dropdown).select_by_value(str(month_num))
    print(" Selected via <select>")
    return True


def parse_month_page(driver, month_num):
    """Switch the loaded page to ``month_num`` and return its plant list.

    Four ways of activating the month are tried in order (href match, link
    text, data attribute, ``<select>``); the first that succeeds wins. A
    strategy that raises is reported and the next one is tried.

    Args:
        driver: Selenium WebDriver with the base page already loaded.
        month_num: Month number, a key of ``MONTHS``.

    Returns:
        The list produced by ``extract_plants_from_html`` for the page source
        after the click, or ``[]`` when no strategy could select the month.
    """
    print(f" Processing month {month_num} ({MONTHS[month_num]})")

    strategies = (
        _click_month_by_href,
        _click_month_by_text,
        _click_month_by_attribute,
        _select_month_in_dropdown,
    )
    clicked = False
    for number, strategy in enumerate(strategies, 1):
        try:
            clicked = strategy(driver, month_num)
        except Exception as e:
            print(f" Strategy {number} failed: {e}")
        if clicked:
            break

    if not clicked:
        print(f" [ERROR] Could not click month {month_num}")
        try:
            sel = driver.find_element(By.CLASS_NAME, "month-selector")
            # get_attribute may return None; guard before slicing.
            outer = sel.get_attribute('outerHTML') or ""
            print(f" Selector HTML:\n{outer[:1000]}")
        except Exception:
            # Not a bare except: Ctrl-C / SystemExit must still propagate.
            print(" month-selector element not found")
        return []

    # Fixed pause to let the page update after the click before reading it.
    time.sleep(3)
    html = driver.page_source
    plants = extract_plants_from_html(html, month_num)
    print(f" Found {len(plants)} plants in {MONTHS[month_num]}")
    return plants
|
|
|
|
|
|
def parse_plant_detail(driver, plant_name, detail_url):
    """Load a plant's detail page and pull out descriptive fields.

    Args:
        driver: Selenium WebDriver used to fetch the page.
        plant_name: Name used only in progress and warning messages.
        detail_url: URL of the plant detail page.

    Returns:
        A dict with keys ``description``, ``spacing``, ``height``,
        ``row_spacing``, ``harvest_time``, ``pot_friendly``, ``notes``,
        ``compatible`` and ``avoid``. Fields that cannot be found keep their
        empty defaults.
    """
    print(f" Detail: {plant_name}")
    driver.get(detail_url)
    try:
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, "body")))
    except Exception as e:
        # Best effort: parse whatever loaded, but do not hide the failure.
        print(f" [WARN] Wait for page body failed for {plant_name}: {e}")
    time.sleep(2)

    soup = BeautifulSoup(driver.page_source, "lxml")
    info = {
        "description": "",
        "spacing": "",
        "height": "",
        "row_spacing": "",
        "harvest_time": "",
        "pot_friendly": False,
        "notes": "",
        "compatible": [],
        "avoid": [],
    }

    desc_el = (soup.find("div", class_="description") or
               soup.find("div", class_="plant-description") or
               soup.find("p", class_="description"))
    if desc_el:
        info["description"] = desc_el.get_text(" ", strip=True)
    else:
        # Fallback: first paragraph longer than 60 characters.
        container = soup.find("main") or soup.find("div", id="content") or soup
        for p in container.find_all("p"):
            text = p.get_text(strip=True)
            if len(text) > 60:
                info["description"] = text
                break

    full = soup.get_text(" ", strip=True)

    def extract_after(text, label, not_after=None):
        """Return up to 120 characters following ``label`` (case-insensitive).

        When ``not_after`` is given, occurrences of ``label`` directly
        preceded by that word and one whitespace character are ignored.
        """
        guard = rf"(?<!{re.escape(not_after)}\s)" if not_after else ""
        m = re.search(guard + re.escape(label) + r"[:\s]*([^\n]{1,120})",
                      text, re.IGNORECASE)
        return m.group(1).strip() if m else ""

    # "Spacing" alone would also match inside "Row spacing"; exclude that.
    info["spacing"] = extract_after(full, "Spacing", not_after="row")
    info["row_spacing"] = extract_after(full, "Row spacing")
    info["height"] = extract_after(full, "Height")
    info["harvest_time"] = extract_after(full, "Harvest")

    pot = re.search(r"can grow in (pots|containers)[:\s]*([^\n]+)", full, re.IGNORECASE)
    if pot:
        info["pot_friendly"] = pot.group(2).strip().lower().startswith(("yes", "true"))

    notes_el = soup.find("div", class_="notes") or soup.find("div", class_="tips")
    if notes_el:
        info["notes"] = notes_el.get_text(" ", strip=True)

    cm = re.search(r"(?:grows well with|compatible)[:\s]+([^\n.]+)", full, re.IGNORECASE)
    if cm:
        info["compatible"] = [x.strip() for x in cm.group(1).split(",") if x.strip()]

    av = re.search(r"(?:avoid growing|keep away)[:\s]+([^\n.]+)", full, re.IGNORECASE)
    if av:
        info["avoid"] = [x.strip() for x in av.group(1).split(",") if x.strip()]

    return info
|
|
|
|
|
|
def _load_saved_data(path):
    """Return previously saved scrape data, or a fresh skeleton if unusable."""
    fresh = {"zone": "USA - Zone 8b", "plants": {}}
    if not os.path.exists(path):
        return fresh
    with open(path, "r", encoding="utf-8") as f:
        try:
            data = json.load(f)
        except json.JSONDecodeError:
            return fresh
    # Valid JSON of the wrong shape is treated like an unreadable file.
    if not isinstance(data, dict) or not isinstance(data.get("plants", {}), dict):
        return fresh
    data.setdefault("plants", {})
    print(f"Resuming — {len(data['plants'])} plants already saved.")
    return data


def _save_data(all_data, path):
    """Write the whole data set to ``path`` as indented JSON."""
    with open(path, "w", encoding="utf-8") as f:
        json.dump(all_data, f, indent=2)


def _new_plant_record(plant, image):
    """Build an empty database record for a plant found on a month page."""
    return {
        "name": plant["name"],
        "aliases": plant["aliases"],
        "image": image,
        "description": "",
        "spacing": "",
        "height": "",
        "row_spacing": "",
        "harvest_time": "",
        "pot_friendly": False,
        "notes": "",
        "compatible": [],
        "avoid": [],
        "months": {},
        "detail_url": plant["detail_url"],
    }


def main():
    """Run the scrape: scan months, build the database, fetch details.

    Existing output in ``OUTPUT_FILE`` is loaded first so an interrupted run
    can resume; plants whose stored description is longer than 20 characters
    are not fetched again. The JSON file is rewritten after the month merge
    and after every detail fetch.
    """
    print("=" * 60)
    print("Gardenate Zone 8b Scraper (Selenium - Click Mode)")
    print("=" * 60)

    os.makedirs(os.path.dirname(OUTPUT_FILE), exist_ok=True)
    os.makedirs(IMG_DIR, exist_ok=True)

    all_data = _load_saved_data(OUTPUT_FILE)
    plants_db = all_data["plants"]

    img_session = req_lib.Session()
    try:
        img_session.headers.update({"User-Agent": "Mozilla/5.0 (compatible; FarmGardenBot/1.0)"})

        print("\nStarting Chrome (headless)...")
        driver = make_driver()

        try:
            print("\nLoading base page...")
            driver.get(f"{BASE_URL}/?zone={ZONE_PARAM}")
            time.sleep(4)

            try:
                sel = driver.find_element(By.CLASS_NAME, "month-selector")
                print(f"\nMonth selector HTML:\n{sel.get_attribute('outerHTML')[:2000]}\n")
            except Exception as e:
                print(f"Could not find month-selector: {e}")

            # Step 1: Scan all 12 months
            print("\n[Step 1] Scanning all 12 months...")
            month_plant_map = {}
            for month_num in range(1, 13):
                month_plant_map[month_num] = parse_month_page(driver, month_num)
                time.sleep(1)

            # Step 2: Build plant database + download images
            print("\n[Step 2] Building plant database + downloading images...")
            for month_num, month_plants in month_plant_map.items():
                for p in month_plants:
                    slug = slugify(p["name"])
                    if slug not in plants_db:
                        href = p.get("href", f"/plant/{slug}")
                        img_path = download_image(img_session, p["name"], href, IMG_DIR)
                        plants_db[slug] = _new_plant_record(p, img_path)
                        time.sleep(0.5)

                    record = plants_db[slug]
                    # Resumed records may predate these keys; tolerate that.
                    record.setdefault("months", {})[str(month_num)] = p["action"] or "Plant"
                    if p["aliases"] and not record.get("aliases"):
                        record["aliases"] = p["aliases"]

            _save_data(all_data, OUTPUT_FILE)
            print(f" Saved {len(plants_db)} plants to JSON")

            # Step 3: Fetch plant details
            print(f"\n[Step 3] Fetching details for {len(plants_db)} plants...")
            for i, (slug, plant) in enumerate(plants_db.items(), 1):
                if plant.get("description") and len(plant["description"]) > 20:
                    print(f" [{i}/{len(plants_db)}] Skipping {plant['name']} (cached)")
                    continue

                detail_url = plant.get("detail_url", f"{BASE_URL}/plant/{slug}?zone={ZONE_PARAM}")
                details = parse_plant_detail(driver, plant["name"], detail_url)
                plants_db[slug].update(details)

                # Persist after every plant so an interrupted run can resume.
                _save_data(all_data, OUTPUT_FILE)

                time.sleep(DELAY)

        finally:
            driver.quit()
    finally:
        img_session.close()

    _save_data(all_data, OUTPUT_FILE)

    print(f"\n[Done] Saved {len(plants_db)} plants to {OUTPUT_FILE}")
    print(f"Images saved to: {IMG_DIR}")
|
|
|
|
|
|
if __name__ == "__main__":
|
|
main() |