Monorepo: consolidate 7 repos into one
Combines shared, blog, market, cart, events, federation, and account into a single repository. Eliminates submodule sync, sibling model copying at build time, and per-app CI orchestration.

Changes:
- Remove per-app .git, .gitmodules, .gitea, submodule shared/ dirs
- Remove stale sibling model copies from each app
- Update all 6 Dockerfiles for monorepo build context (root = .)
- Add build directives to docker-compose.yml
- Add single .gitea/workflows/ci.yml with change detection
- Add .dockerignore for monorepo build context
- Create __init__.py for federation and account (cross-app imports)
This commit is contained in:
53
market/scrape/product/helpers/html.py
Normal file
53
market/scrape/product/helpers/html.py
Normal file
@@ -0,0 +1,53 @@
|
||||
|
||||
from __future__ import annotations
|
||||
from typing import List, Optional
|
||||
from urllib.parse import urljoin, urlparse
|
||||
from shared.config import config
|
||||
|
||||
def first_from_srcset(val: str) -> Optional[str]:
    """Return the URL of the first image candidate in a ``srcset`` value.

    The URL is taken to be the first whitespace-delimited token, with
    separator commas stripped from its edges. Splitting on whitespace
    rather than on commas keeps URLs that themselves contain commas (for
    example CDN transform paths such as ``w_300,h_200``) intact, and lets
    empty leading candidates such as ``", a.jpg 1x"`` be skipped.

    Note that a comma that is not adjacent to whitespace is treated as part
    of the URL, so ``"a.jpg,b.jpg"`` is returned unchanged.

    Args:
        val: Raw ``srcset``-style attribute value.

    Returns:
        The first candidate URL, or ``None`` when ``val`` is empty or holds
        nothing but whitespace and commas.
    """
    if not val:
        return None
    for token in val.split():
        # Commas at the edges of a token separate candidates; they are not
        # part of the URL itself.
        url = token.strip(",")
        if url:
            return url
    return None
|
||||
|
||||
def abs_url(u: Optional[str]) -> Optional[str]:
    """Turn a root-relative URL into an absolute one.

    Args:
        u: Candidate URL, possibly ``None`` or empty.

    Returns:
        ``None`` for falsy input. A string starting with ``"/"`` is joined
        onto ``config()["base_url"]`` with ``urljoin``. Any other value is
        returned unchanged.
    """
    if not u:
        return None
    # Only string values with a leading slash are resolved against the base.
    if not isinstance(u, str) or not u.startswith("/"):
        return u
    return urljoin(config()["base_url"], u)
|
||||
|
||||
def collect_img_candidates(el) -> List[str]:
    """Gather candidate image URLs from an element's attributes.

    Args:
        el: Object exposing ``.get(name)`` for attribute lookup; a falsy
            value yields an empty list.

    Returns:
        Truthy values of the plain URL attributes, in their fixed lookup
        order, followed by the first candidate of ``srcset`` and then
        ``data-srcset`` when those are present. Duplicates are not removed.
    """
    if not el:
        return []

    plain_attrs = (
        "src",
        "data-src",
        "data-original",
        "data-zoom-image",
        "data-thumb",
        "content",
        "href",
    )
    looked_up = (el.get(name) for name in plain_attrs)
    found: List[str] = [value for value in looked_up if value]

    # srcset-style attributes contribute only their first candidate.
    for name in ("srcset", "data-srcset"):
        raw = el.get(name)
        if not raw:
            continue
        candidate = first_from_srcset(raw)
        if candidate:
            found.append(candidate)
    return found
|
||||
|
||||
def _filename_key(u: str) -> str:
    """Build a lowercase ``"<netloc>:<last path segment>"`` key for a URL.

    A single trailing slash on the path is dropped before the last segment
    is taken. Query string and fragment do not contribute to the key.
    """
    parsed = urlparse(u)
    trimmed = parsed.path or ""
    if trimmed.endswith("/"):
        trimmed = trimmed[:-1]
    # Everything after the final slash (the whole path if there is none).
    _, _, filename = trimmed.rpartition("/")
    return "{}:{}".format(parsed.netloc, filename).lower()
|
||||
|
||||
def dedup_by_filename(urls: List[str]) -> List[str]:
    """Drop URLs whose ``_filename_key`` repeats that of an earlier URL.

    Args:
        urls: URLs to filter.

    Returns:
        A new list holding the first URL seen for each key, in input order.
    """
    first_by_key = {}
    for candidate in urls:
        # setdefault leaves an existing entry alone, so the earliest URL
        # for a key is kept; dicts preserve insertion order.
        first_by_key.setdefault(_filename_key(candidate), candidate)
    return list(first_by_key.values())
|
||||
Reference in New Issue
Block a user