Compare commits

...

8 Commits

Author SHA1 Message Date
CyberMind
051ca6d1d7
Merge pull request #660 from CyberMind-FR/feature/659-feat-toolbox-per-visitor-ad-block-breakd
Some checks are pending
License Headers / check (push) Waiting to run
feat(toolbox): per-visitor ad-block breakdown in #ads (top visitors + per-visitor drill-down)
2026-06-18 14:13:41 +02:00
69659f6a67 chore(toolbox): changelog 2.6.58 for per-visitor ad breakdown (ref #659) 2026-06-18 14:13:16 +02:00
1bd5108472 feat(toolbox #659): #ads tab — top visitors + per-visitor drill-down
renders d.top_visitors into #ads-visitors; clickable mac_hash calls
loadAdsClient → /admin/ad-stats/client/{mac_hash} into #ads-client-detail.
all values escaped.
2026-06-18 14:10:00 +02:00
6c96ba62e4 feat(toolbox #659): GET /admin/ad-stats/client/{mac_hash} drill-down
per-visitor ad-block detail endpoint, hours clamped 1..168.
2026-06-18 14:09:22 +02:00
4d0cbf8b7f feat(toolbox #659): ad_ghost accumulates blocked ads per visitor
guarded mac_hash_of import; _cli hot-path dict; per-visitor tally on the
204 block branch; drained + offloaded to record_ad_client_blocks in _flush.
hot path stays in-memory increments only.
2026-06-18 14:08:59 +02:00
4f96da87d7 feat(toolbox #659): store per-visitor ad-block breakdown
ad_block_client_host table + record_ad_client_blocks upsert +
ad_client_stats drill-down; ad_stats now returns top_visitors.
2026-06-18 14:07:02 +02:00
CyberMind
2b036db0d6
Merge pull request #658 from CyberMind-FR/fix/ad-learn-self-block-hardening
fix(toolbox #658): ad-learn never self-blocks own infra + exact-host promotion
2026-06-18 13:55:47 +02:00
376b4ecd2a fix(toolbox #658): ad-learn never self-blocks own infra + exact-host promotion (no registrable over-fold) 2026-06-18 13:53:42 +02:00
10 changed files with 388 additions and 3 deletions

View File

@ -1,3 +1,30 @@
secubox-toolbox (2.6.58-1~bookworm1) bookworm; urgency=medium
* feat(#659): per-visitor ad-block breakdown in #ads. ad_ghost now also tallies
the visitor (mac_hash_of(client_ip), cached WG-hash for R3 — no blocking) per
blocked ad into a new `ad_block_client_host` table (hot-path = dict increments,
bg-thread flush). `/admin/ad-stats` gains `top_visitors`; new
`/admin/ad-stats/client/{mac_hash}` for the drill-down. #ads tab shows a
"Top visiteurs (pubs bloquées)" table; click a visitor → their top ad hosts.
Bounded (clients×hosts); own-infra guard (#658) keeps our own hosts out.
-- Gerald KERMA <devel@cybermind.fr> Thu, 18 Jun 2026 20:00:00 +0200
secubox-toolbox (2.6.57-1~bookworm1) bookworm; urgency=medium
* fix(#658): ad-learn hardening — never self-block own infrastructure.
- ad_ghost `_allowed` now ALWAYS allows the appliance's own domains
(`_SELF_REGS`, default {secubox.in}, env `SECUBOX_SELF_DOMAINS`) — hard-coded
so it survives a reflash with no allowlist file; this also stops own-infra
from ever being captured as a candidate (early-return).
- autolearn `_ad_feed` excludes own-infra AND promotes the EXACT candidate
host instead of its registrable — so a tracker subdomain
(analytics.tiktok.com) no longer blocks the parent site (tiktok.com).
Root cause: the aggressive learner self-promoted secubox.in → 204'd all
*.secubox.in for R3; live-mitigated via allowlist, now fixed at source.
-- Gerald KERMA <devel@cybermind.fr> Thu, 18 Jun 2026 19:00:00 +0200
secubox-toolbox (2.6.56-1~bookworm1) bookworm; urgency=medium
* feat(#656): Ad Intelligence — learn · act · measure.

View File

@ -41,6 +41,12 @@ try:
except Exception: # pragma: no cover except Exception: # pragma: no cover
_store = None _store = None
# #659 — resolve client IP → stable per-visitor identity hash (best-effort).
try:
from _common import mac_hash_of # noqa: E402
except Exception: # pragma: no cover
mac_hash_of = None
_executor = concurrent.futures.ThreadPoolExecutor( _executor = concurrent.futures.ThreadPoolExecutor(
max_workers=1, thread_name_prefix="sbx_ad") max_workers=1, thread_name_prefix="sbx_ad")
@ -50,6 +56,12 @@ _EST_BYTES_PER_REQ = 45000 # honest estimate per blocked ad/tracker request
# #656 — operator allowlist (host or registrable, one per line, # comments). # #656 — operator allowlist (host or registrable, one per line, # comments).
# Allowlist ALWAYS wins: an allowlisted host is never 204'd nor recorded. # Allowlist ALWAYS wins: an allowlisted host is never 204'd nor recorded.
_ALLOW_PATH = "/var/lib/secubox/toolbox/ad-allowlist.txt" _ALLOW_PATH = "/var/lib/secubox/toolbox/ad-allowlist.txt"
# #658 — the appliance's OWN domains. NEVER blocked/learned (the aggressive
# learner once self-promoted secubox.in → 204'd all *.secubox.in for R3).
# Hard-coded (env-overridable) so it survives a reflash with no allowlist file.
_SELF_REGS = {d.strip().lower() for d in
os.environ.get("SECUBOX_SELF_DOMAINS", "secubox.in").split(",")
if d.strip()}
# Path heuristics for 3rd-party ad/track candidate capture (learning only). # Path heuristics for 3rd-party ad/track candidate capture (learning only).
_AD_PATH = re.compile(r"/ads?/|/adserver|/pagead|/gampad|/doubleclick|/beacon|" _AD_PATH = re.compile(r"/ads?/|/adserver|/pagead|/gampad|/doubleclick|/beacon|"
r"/pixel|/collect|/track(ing)?|/telemetry|/metric", re.I) r"/pixel|/collect|/track(ing)?|/telemetry|/metric", re.I)
@ -57,6 +69,7 @@ _AD_PATH = re.compile(r"/ads?/|/adserver|/pagead|/gampad|/doubleclick|/beacon|"
# Hot-path dict increments only; drained + offloaded to SQLite in _flush. # Hot-path dict increments only; drained + offloaded to SQLite in _flush.
_ctx: dict = {} # (host, site, action) -> [hits, bytes] _ctx: dict = {} # (host, site, action) -> [hits, bytes]
_cand: dict = {} # (host, site) -> hits _cand: dict = {} # (host, site) -> hits
_cli: dict = {} # #659 (mac_hash, ad_host) -> [hits, bytes]
_allow: set = set() _allow: set = set()
_allow_mtime = 0.0 _allow_mtime = 0.0
@ -109,6 +122,10 @@ def _allowed(host: str) -> bool:
pass pass
h = (host or "").lower() h = (host or "").lower()
reg = _registrable(h) or h reg = _registrable(h) or h
# #658 — own infra always allowed (never block/capture our own domains),
# independent of the allowlist file (reflash-safe).
if reg in _SELF_REGS or any(h == d or h.endswith("." + d) for d in _SELF_REGS):
return True
return h in _allow or reg in _allow return h in _allow or reg in _allow
@ -197,16 +214,20 @@ def _flush(force: bool = False) -> None:
# thread, so the proxy event loop never touches the DB. Snapshot+clear # thread, so the proxy event loop never touches the DB. Snapshot+clear
# under no lock is fine: CPython dict ops are atomic and a missed increment # under no lock is fine: CPython dict ops are atomic and a missed increment
# between snapshot and clear is harmless (stats, not security). # between snapshot and clear is harmless (stats, not security).
if _store is not None and (_ctx or _cand): if _store is not None and (_ctx or _cand or _cli):
try: try:
rows = [(h, s, a, v[0], v[1]) for (h, s, a), v in _ctx.items()] rows = [(h, s, a, v[0], v[1]) for (h, s, a), v in _ctx.items()]
cand_rows = [(h, s, n) for (h, s), n in _cand.items()] cand_rows = [(h, s, n) for (h, s), n in _cand.items()]
cli_rows = [(mh, h, v[0], v[1]) for (mh, h), v in _cli.items()]
_ctx.clear() _ctx.clear()
_cand.clear() _cand.clear()
_cli.clear()
if rows: if rows:
_executor.submit(_store.record_ad_blocks, rows) _executor.submit(_store.record_ad_blocks, rows)
if cand_rows: if cand_rows:
_executor.submit(_store.record_ad_candidates, cand_rows) _executor.submit(_store.record_ad_candidates, cand_rows)
if cli_rows:
_executor.submit(_store.record_ad_client_blocks, cli_rows)
except Exception: except Exception:
pass pass
@ -261,6 +282,20 @@ class AdGhost:
_ctx[k] = v _ctx[k] = v
except Exception: except Exception:
pass pass
# #659 — per-visitor breakdown: resolve the client identity and
# tally this blocked ad host against it. Dict increment only.
try:
if mac_hash_of is not None and len(_cli) < 50000:
ip = flow.client_conn.peername[0] if flow.client_conn.peername else None
mh = mac_hash_of(ip) if ip else None
if mh:
ck = (mh, host)
cv = _cli.get(ck) or [0, 0]
cv[0] += 1
cv[1] += _EST_BYTES_PER_REQ
_cli[ck] = cv
except Exception:
pass
_flush() _flush()
elif f.get("ad_learn", True) and site: elif f.get("ad_learn", True) and site:
# #656 — aggressive candidate capture: 3rd-party request whose path # #656 — aggressive candidate capture: 3rd-party request whose path

View File

@ -176,6 +176,11 @@ def _ad_feed() -> int:
sys.stderr.write(f"autolearn: ad query failed: {e}\n") sys.stderr.write(f"autolearn: ad query failed: {e}\n")
return -1 return -1
allow = _load_ad_allowlist() allow = _load_ad_allowlist()
# #658 — never promote the appliance's own domains (the learner once
# self-promoted secubox.in). Hard default + env-overridable.
self_doms = {d.strip().lower() for d in
os.environ.get("SECUBOX_SELF_DOMAINS", "secubox.in").split(",")
if d.strip()}
promoted: set = set() promoted: set = set()
for r in rows: for r in rows:
h = (r[0] or "").lower().strip(".") h = (r[0] or "").lower().strip(".")
@ -184,7 +189,12 @@ def _ad_feed() -> int:
reg = registrable(h) or h reg = registrable(h) or h
if h in allow or reg in allow: if h in allow or reg in allow:
continue continue
promoted.add(reg) if reg in self_doms or any(h == d or h.endswith("." + d) for d in self_doms):
continue
# #658 — promote the EXACT host, NOT the registrable: blocking a tracker
# subdomain (analytics.tiktok.com) must never block the parent site
# (tiktok.com). Dedicated ad hosts are already registrable-level.
promoted.add(h)
if not promoted: if not promoted:
return 0 return 0
# MERGE with existing learned-trackers.txt (union, dedup, cap). # MERGE with existing learned-trackers.txt (union, dedup, cap).

View File

@ -2436,6 +2436,13 @@ async def admin_ad_stats(hours: int = 24) -> dict:
return store.ad_stats(hours=h) return store.ad_stats(hours=h)
@router.get("/admin/ad-stats/client/{mac_hash}")
async def admin_ad_stats_client(mac_hash: str, hours: int = 24) -> dict:
    """#659 — read-only drill-down of the ads blocked for a single visitor.

    ``hours`` is clamped to the range 1..168 (``None`` falls back to 24)
    before the lookup is delegated to ``store.ad_client_stats``.
    """
    requested = 24 if hours is None else hours
    window = min(int(requested), 168)
    if window < 1:
        window = 1
    return store.ad_client_stats(mac_hash, hours=window)
@router.get("/admin/ghost") @router.get("/admin/ghost")
async def admin_ghost() -> dict: async def admin_ghost() -> dict:
"""#566 — ad/banner ghoster savings (R3+/R4). Read-only counters.""" """#566 — ad/banner ghoster savings (R3+/R4). Read-only counters."""

View File

@ -58,6 +58,10 @@ CREATE TABLE IF NOT EXISTS ad_block_stats (
CREATE TABLE IF NOT EXISTS ad_candidates ( CREATE TABLE IF NOT EXISTS ad_candidates (
host TEXT, site TEXT, hits INTEGER NOT NULL DEFAULT 0, last_seen REAL, host TEXT, site TEXT, hits INTEGER NOT NULL DEFAULT 0, last_seen REAL,
PRIMARY KEY (host, site)); PRIMARY KEY (host, site));
CREATE TABLE IF NOT EXISTS ad_block_client_host (
mac_hash TEXT, ad_host TEXT, hits INTEGER NOT NULL DEFAULT 0,
bytes INTEGER NOT NULL DEFAULT 0, last_seen REAL,
PRIMARY KEY (mac_hash, ad_host));
""" """
@ -86,6 +90,43 @@ def record_ad_blocks(rows) -> None:
log.debug("record_ad_blocks failed: %s", e) log.debug("record_ad_blocks failed: %s", e)
def record_ad_client_blocks(rows) -> None:
    """Batch-upsert the per-visitor ad-block tallies (#659).

    ``rows`` is an iterable of ``(mac_hash, ad_host, hits, bytes)`` tuples.
    Falsy rows and rows with an empty ``mac_hash`` are dropped. For a
    ``(mac_hash, ad_host)`` pair that already exists, hits and bytes are added
    to the stored totals and ``last_seen`` is refreshed. Errors raised while
    writing are logged at debug level instead of propagating.
    """
    kept = []
    for row in rows:
        if row and row[0]:
            kept.append(row)
    if not kept:
        return
    stamp = time.time()
    upsert_sql = (
        "INSERT INTO ad_block_client_host(mac_hash,ad_host,hits,bytes,last_seen) "
        "VALUES(?,?,?,?,?) ON CONFLICT(mac_hash,ad_host) DO UPDATE SET "
        "hits=hits+excluded.hits, bytes=bytes+excluded.bytes, last_seen=excluded.last_seen"
    )
    try:
        with _conn() as db:
            # Built inside the try so a malformed row is logged, not raised.
            params = [
                (visitor, ad_host or "", int(hits), int(size), stamp)
                for (visitor, ad_host, hits, size) in kept
            ]
            db.executemany(upsert_sql, params)
    except Exception as exc:
        log.debug("record_ad_client_blocks failed: %s", exc)
def ad_client_stats(mac_hash: str, hours: int = 24, top: int = 25) -> dict:
    """Summarise the ad hosts blocked for one visitor (#659).

    Returns a dict with ``mac_hash``, ``total`` (sum of hits) and
    ``top_hosts`` (at most ``top`` entries of host/hits/bytes, busiest first).
    Rows are selected by ``last_seen >= now - hours * 3600``. If a query
    fails, the error is logged at debug level and the fields filled in so far
    are returned.
    """
    since = time.time() - hours * 3600
    result = {"mac_hash": mac_hash, "total": 0, "top_hosts": []}
    total_sql = (
        "SELECT SUM(hits) FROM ad_block_client_host WHERE mac_hash=? AND last_seen>=?"
    )
    hosts_sql = (
        "SELECT ad_host, SUM(hits), SUM(bytes) FROM ad_block_client_host "
        "WHERE mac_hash=? AND last_seen>=? GROUP BY ad_host ORDER BY SUM(hits) DESC LIMIT ?"
    )
    try:
        with _conn() as db:
            first = db.execute(total_sql, (mac_hash, since)).fetchone()
            # SUM() yields NULL when nothing matches; treat that as zero.
            summed = first[0] if first else 0
            result["total"] = int(summed or 0)
            hosts = []
            for ad_host, hits, size in db.execute(hosts_sql, (mac_hash, since, top)):
                hosts.append({"host": ad_host, "hits": int(hits), "bytes": int(size or 0)})
            # Assigned only once fully built, so a mid-iteration failure leaves [].
            result["top_hosts"] = hosts
    except Exception as exc:
        log.debug("ad_client_stats failed: %s", exc)
    return result
def record_ad_candidates(rows) -> None: def record_ad_candidates(rows) -> None:
"""rows: iterable of (host, site, hits).""" """rows: iterable of (host, site, hits)."""
rows = [r for r in rows if r and r[0]] rows = [r for r in rows if r and r[0]]
@ -117,7 +158,8 @@ def ad_candidate_sites(min_sites: int = 1, max_hosts: int = 5000) -> list:
def ad_stats(hours: int = 24, top: int = 25) -> dict: def ad_stats(hours: int = 24, top: int = 25) -> dict:
cutoff = time.time() - hours * 3600 cutoff = time.time() - hours * 3600
out = {"window_hours": hours, "total_blocked": 0, "total_bytes": 0, out = {"window_hours": hours, "total_blocked": 0, "total_bytes": 0,
"by_action": {"block": 0, "silent": 0}, "top_hosts": [], "top_sites": []} "by_action": {"block": 0, "silent": 0}, "top_hosts": [], "top_sites": [],
"top_visitors": []}
try: try:
with _conn() as c: with _conn() as c:
for action, hits in c.execute( for action, hits in c.execute(
@ -134,6 +176,10 @@ def ad_stats(hours: int = 24, top: int = 25) -> dict:
out["top_sites"] = [{"site": s, "hits": int(n)} for s, n in c.execute( out["top_sites"] = [{"site": s, "hits": int(n)} for s, n in c.execute(
"SELECT site, SUM(hits) FROM ad_block_stats WHERE action='block' AND last_seen>=? AND site<>'' " "SELECT site, SUM(hits) FROM ad_block_stats WHERE action='block' AND last_seen>=? AND site<>'' "
"GROUP BY site ORDER BY SUM(hits) DESC LIMIT ?", (cutoff, top))] "GROUP BY site ORDER BY SUM(hits) DESC LIMIT ?", (cutoff, top))]
out["top_visitors"] = [{"mac_hash": mh, "hits": int(n)} for mh, n in c.execute(
"SELECT mac_hash, SUM(hits) FROM ad_block_client_host "
"WHERE last_seen>=? AND mac_hash<>'' GROUP BY mac_hash "
"ORDER BY SUM(hits) DESC LIMIT ?", (cutoff, top))]
except Exception as e: except Exception as e:
log.debug("ad_stats failed: %s", e) log.debug("ad_stats failed: %s", e)
return out return out

View File

@ -0,0 +1,39 @@
# SPDX-License-Identifier: LicenseRef-CMSD-1.0
# Copyright (c) 2026 CyberMind — Gérald Kerma <devel@cybermind.fr>
"""Tests for GET /admin/ad-stats/client/{mac_hash} (#659)."""
import asyncio
from secubox_toolbox import api, store
# Canned store response shared by the tests in this file: one visitor with two
# blocked ad hosts whose hits (5 + 2) add up to the reported total of 7.
_CANNED = {
    "mac_hash": "MH_FIXED",
    "total": 7,
    "top_hosts": [{"host": "ads.example.com", "hits": 5, "bytes": 225000},
                  {"host": "px.tracker.io", "hits": 2, "bytes": 90000}],
}
def test_ad_stats_client_returns_store_data(monkeypatch):
    """The endpoint returns the store's payload for the requested visitor."""

    def canned_stats(mac_hash, hours=24, **kw):
        return dict(_CANNED)

    monkeypatch.setattr(store, "ad_client_stats", canned_stats)
    payload = asyncio.run(api.admin_ad_stats_client("MH_FIXED", hours=24))
    assert payload["mac_hash"] == "MH_FIXED"
    assert payload["total"] == 7
    first_host = payload["top_hosts"][0]
    assert first_host["host"] == "ads.example.com"
def test_ad_stats_client_clamps_hours(monkeypatch):
    """Out-of-range ``hours`` values reach the store clamped to 1..168."""
    seen = {}

    def spy(mac_hash, hours=24, **kw):
        seen.update(hours=hours, mac_hash=mac_hash)
        return dict(_CANNED)

    monkeypatch.setattr(store, "ad_client_stats", spy)

    # Below the lower bound → raised to 1.
    asyncio.run(api.admin_ad_stats_client("MH", hours=0))
    assert seen["hours"] == 1
    assert seen["mac_hash"] == "MH"

    # Above the upper bound → capped at 168.
    asyncio.run(api.admin_ad_stats_client("MH", hours=9999))
    assert seen["hours"] == 168

View File

@ -0,0 +1,63 @@
# SPDX-License-Identifier: LicenseRef-CMSD-1.0
# Copyright (c) 2026 CyberMind — Gérald Kerma <devel@cybermind.fr>
"""Tests for per-visitor ad-block breakdown store (#659)."""
from pathlib import Path
from secubox_toolbox import store
def _fresh(tmp_path, mp):
    """Point ``store.DB_PATH`` at a throwaway database file under ``tmp_path``."""
    db_file = Path(tmp_path) / "t.db"
    mp.setattr(store, "DB_PATH", db_file)
def test_record_ad_client_blocks_accumulates(tmp_path, monkeypatch):
    """Two batches for the same (visitor, host) pair add up."""
    _fresh(tmp_path, monkeypatch)
    first_batch = [
        ("mh_a", "ads.example.com", 3, 3 * 45000),
        ("mh_a", "px.tracker.io", 1, 45000),
    ]
    second_batch = [("mh_a", "ads.example.com", 2, 2 * 45000)]
    for batch in (first_batch, second_batch):
        store.record_ad_client_blocks(batch)

    stats = store.ad_client_stats("mh_a", hours=24)
    assert stats["mac_hash"] == "mh_a"
    assert stats["total"] == 6
    by_host = {entry["host"]: entry for entry in stats["top_hosts"]}
    assert by_host["ads.example.com"]["hits"] == 5
    assert by_host["ads.example.com"]["bytes"] == 5 * 45000
    assert by_host["px.tracker.io"]["hits"] == 1
def test_record_ad_client_blocks_skips_empty_mac(tmp_path, monkeypatch):
    """Rows with an empty ``mac_hash`` never show up among the top visitors."""
    _fresh(tmp_path, monkeypatch)
    batch = [
        ("", "ads.example.com", 5, 5 * 45000),
        ("mh_b", "ads.example.com", 2, 2 * 45000),
    ]
    store.record_ad_client_blocks(batch)

    summary = store.ad_stats(hours=24)
    visitors = {entry["mac_hash"] for entry in summary["top_visitors"]}
    assert "" not in visitors
    assert visitors == {"mh_b"}
def test_ad_stats_top_visitors_ranked(tmp_path, monkeypatch):
    """``top_visitors`` sums hits per visitor and lists the busiest first."""
    _fresh(tmp_path, monkeypatch)
    batch = [
        ("mh_busy", "ads.example.com", 10, 0),
        ("mh_busy", "px.tracker.io", 5, 0),
        ("mh_quiet", "ads.example.com", 2, 0),
    ]
    store.record_ad_client_blocks(batch)

    ranked = store.ad_stats(hours=24)["top_visitors"]
    busiest = ranked[0]
    assert busiest["mac_hash"] == "mh_busy" and busiest["hits"] == 15
    runner_up = ranked[1]
    assert runner_up["mac_hash"] == "mh_quiet" and runner_up["hits"] == 2
def test_ad_client_stats_window(tmp_path, monkeypatch):
    """Rows count inside the look-back window and drop out of an empty one."""
    _fresh(tmp_path, monkeypatch)
    store.record_ad_client_blocks([("mh_c", "ads.example.com", 4, 0)])

    # 24h look-back: the row just written is included.
    in_window = store.ad_client_stats("mh_c", hours=24)
    assert in_window["total"] == 4

    # hours=0 puts the cutoff at "now", which is expected to be later than the
    # row's last_seen → nothing matches.
    empty = store.ad_client_stats("mh_c", hours=0)
    assert empty["mac_hash"] == "mh_c"
    assert empty["total"] == 0
    assert empty["top_hosts"] == []

View File

@ -0,0 +1,76 @@
# SPDX-License-Identifier: LicenseRef-CMSD-1.0
# Copyright (c) 2026 CyberMind — Gérald Kerma <devel@cybermind.fr>
"""Tests for ad_ghost per-visitor accumulation on the block path (#659)."""
import time
import types
import pytest
from mitmproxy_addons import ad_ghost
def _flow(host, path="/", peer="10.99.1.5"):
    """Build a minimal stand-in for a proxy flow.

    The fake carries ``request`` (``pretty_host``, ``path`` and a ``headers``
    object whose ``get`` always returns the supplied default),
    ``client_conn.peername`` as ``(peer, 0)`` and ``response`` set to None.
    """

    def _header_get(k, d=""):
        # No headers are ever present: always hand back the default.
        return d

    fake_headers = types.SimpleNamespace(get=_header_get)
    fake_request = types.SimpleNamespace(
        pretty_host=host, path=path, headers=fake_headers)
    fake_conn = types.SimpleNamespace(peername=(peer, 0))
    return types.SimpleNamespace(
        request=fake_request, client_conn=fake_conn, response=None)
@pytest.fixture(autouse=True)
def _reset(monkeypatch, tmp_path):
    """Give every test a clean ad_ghost module state and fixed collaborators.

    Clears the hot-path dicts, resets the allowlist cache, points the
    allowlist at a file under ``tmp_path``, enables all ad filters, makes
    ``mac_hash_of`` return ``"MH_FIXED"`` and pins ``_last_flush`` to now.
    After the test the dicts and the blocked-request counter are reset again.
    """

    def _wipe_hot_dicts():
        # Look the dicts up on each call so a rebound attribute is still hit.
        for name in ("_ctx", "_cand", "_cli"):
            getattr(ad_ghost, name).clear()

    def _filters(force=False):
        return {
            "ad_ghost": 1, "ad_ghost_block": 1, "ad_learn": 1, "autolearn": 1,
            "ad_ghost_categories": {},
        }

    _wipe_hot_dicts()
    ad_ghost._allow = set()
    ad_ghost._allow_mtime = 0.0

    patches = (
        ("_ALLOW_PATH", str(tmp_path / "ad-allowlist.txt")),
        ("get_filters", _filters),
        ("mac_hash_of", lambda ip: "MH_FIXED"),
        # Pin the flush timestamp to "now" so _flush()'s time gate (5s per the
        # original note) early-returns and never drains the hot-path dicts
        # before the assertions run.
        ("_last_flush", time.time()),
    )
    for attr, value in patches:
        monkeypatch.setattr(ad_ghost, attr, value)

    yield

    _wipe_hot_dicts()
    # These tests issue 204 blocks → zero the cumulative counter so a
    # later-collected test asserting on _counts is not polluted.
    ad_ghost._counts["blocked_requests"] = 0
def test_blocked_ad_recorded_per_visitor():
    """A blocked ad request is answered with 204 and tallied once for the visitor."""
    blocked = _flow("ad.doubleclick.net", path="/gampad/ads")
    ad_ghost.AdGhost().requestheaders(blocked)
    reply = blocked.response
    assert reply is not None and reply.status_code == 204

    key = ("MH_FIXED", "ad.doubleclick.net")
    assert key in ad_ghost._cli
    tally = ad_ghost._cli[key]
    assert tally[0] == 1
    assert tally[1] == ad_ghost._EST_BYTES_PER_REQ
def test_visitor_accumulates_across_requests():
    """Two blocked requests to the same host count as two hits for the visitor."""
    ghost = ad_ghost.AdGhost()
    for request_path in ("/a", "/b"):
        ghost.requestheaders(_flow("ad.doubleclick.net", path=request_path))
    tally = ad_ghost._cli[("MH_FIXED", "ad.doubleclick.net")]
    assert tally[0] == 2
def test_no_visitor_record_when_mac_hash_unavailable(monkeypatch):
    """When the visitor identity resolves to None, nothing is tallied."""

    def unresolved(ip):
        return None

    monkeypatch.setattr(ad_ghost, "mac_hash_of", unresolved)
    flow = _flow("ad.doubleclick.net", path="/x")
    ad_ghost.AdGhost().requestheaders(flow)
    assert ad_ghost._cli == {}
def test_non_ad_host_not_recorded_per_visitor():
    """A host that is neither an ad host nor learned leaves no per-visitor record."""
    ordinary = _flow("static.cnn.com", path="/img.png")
    ghost = ad_ghost.AdGhost()
    ghost.requestheaders(ordinary)
    # No block happened, so the per-visitor tally must stay empty.
    assert ad_ghost._cli == {}

View File

@ -0,0 +1,57 @@
# SPDX-License-Identifier: LicenseRef-CMSD-1.0
"""#658 — ad-learn must never self-block own infra; promote exact host not registrable."""
import sys
import pathlib
import importlib
import sqlite3
import importlib.util
# Put the ``mitmproxy_addons`` directory (a sibling of this test directory) at
# the front of sys.path so the tests can ``import ad_ghost`` as a top-level module.
ADDON_DIR = pathlib.Path(__file__).resolve().parents[1] / "mitmproxy_addons"
sys.path.insert(0, str(ADDON_DIR))
def test_allowed_never_blocks_own_infra(monkeypatch, tmp_path):
    """Own-infra hosts are allowed by the built-in default, with no allowlist file."""
    # _SELF_REGS is derived from SECUBOX_SELF_DOMAINS when the module is
    # (re)imported, so drop any ambient override first: this test must exercise
    # the hard-coded default, not whatever the runner's environment exports.
    monkeypatch.delenv("SECUBOX_SELF_DOMAINS", raising=False)
    import ad_ghost
    importlib.reload(ad_ghost)
    # Point the allowlist file at a nonexistent path → only the hard-coded
    # _SELF_REGS guard can allow secubox.in.
    monkeypatch.setattr(ad_ghost, "_ALLOW_PATH", str(tmp_path / "nope.txt"))
    assert ad_ghost._allowed("admin.gk2.secubox.in") is True  # own infra
    assert ad_ghost._allowed("kbin.gk2.secubox.in") is True
    assert ad_ghost._allowed("secubox.in") is True
    assert ad_ghost._allowed("ads.doubleclick.net") is False  # real ad host still blockable
def _load_autolearn():
    """Load the extension-less ``sbin/secubox-toolbox-autolearn`` script as a module.

    The script source is compiled and executed inside a fresh module object
    named ``autolearn_h``, which is then returned.
    """
    script = pathlib.Path(__file__).resolve().parents[1] / "sbin" / "secubox-toolbox-autolearn"
    module = importlib.util.module_from_spec(
        importlib.util.spec_from_loader("autolearn_h", loader=None))
    code = compile(script.read_text(), str(script), "exec")
    exec(code, module.__dict__)
    return module
def test_ad_feed_exact_host_and_excludes_self(tmp_path, monkeypatch):
    """``_ad_feed`` promotes the exact candidate host and never the appliance's own."""
    db_path = tmp_path / "t.db"
    seed_sql = (
        "CREATE TABLE ad_candidates(host TEXT, site TEXT, hits INT, last_seen REAL, PRIMARY KEY(host,site));"
        # analytics.tiktok.com seen on 2 sites → promote EXACT, not tiktok.com
        "INSERT INTO ad_candidates VALUES('analytics.tiktok.com','a.com',1,0);"
        "INSERT INTO ad_candidates VALUES('analytics.tiktok.com','b.com',1,0);"
        # our own admin host seen on 2 sites → must NOT promote
        "INSERT INTO ad_candidates VALUES('admin.gk2.secubox.in','a.com',1,0);"
        "INSERT INTO ad_candidates VALUES('admin.gk2.secubox.in','b.com',1,0);"
    )
    conn = sqlite3.connect(db_path)
    conn.executescript(seed_sql)
    conn.commit()
    conn.close()

    learned_path = tmp_path / "learned.txt"
    env = {
        "SECUBOX_AUTOLEARN_DB": str(db_path),
        "SECUBOX_AUTOLEARN_OUT": str(learned_path),
        "SECUBOX_AD_ALLOWLIST": str(tmp_path / "allow.txt"),
        "SECUBOX_AD_MIN_SITES": "2",
        "SECUBOX_SELF_DOMAINS": "secubox.in",
    }
    for key, value in env.items():
        monkeypatch.setenv(key, value)

    autolearn = _load_autolearn()
    autolearn._ad_feed()

    learned = set(learned_path.read_text().split())
    assert "analytics.tiktok.com" in learned  # exact host promoted
    assert "tiktok.com" not in learned  # NOT the parent site
    assert "admin.gk2.secubox.in" not in learned  # own infra excluded
    assert "secubox.in" not in learned

View File

@ -184,6 +184,11 @@
<h2>🌐 Top sites visités</h2> <h2>🌐 Top sites visités</h2>
<div id="ads-sites"><div class="empty">loading…</div></div> <div id="ads-sites"><div class="empty">loading…</div></div>
</div> </div>
<div class="card" style="grid-column:1/-1">
<h2>👤 Top visiteurs (pubs bloquées)</h2>
<div id="ads-visitors"><div class="empty">loading…</div></div>
<div id="ads-client-detail"></div>
</div>
</div> </div>
</section> </section>
@ -523,6 +528,26 @@ async function loadAds() {
document.getElementById('ads-sites').innerHTML = siteRows document.getElementById('ads-sites').innerHTML = siteRows
? '<table><thead><tr><th>Site</th><th>pubs bloquées</th></tr></thead><tbody>'+siteRows+'</tbody></table>' ? '<table><thead><tr><th>Site</th><th>pubs bloquées</th></tr></thead><tbody>'+siteRows+'</tbody></table>'
: ''; : '';
const visRows = (d.top_visitors||[]).map(r=>{
const mh = esc(r.mac_hash);
return `<tr><td><a href="#" onclick="loadAdsClient('${mh}');return false;"><code>${mh}</code></a></td><td>${r.hits}</td></tr>`;
}).join('');
document.getElementById('ads-visitors').innerHTML = visRows
? '<table><thead><tr><th>Visiteur</th><th>pubs bloquées</th></tr></thead><tbody>'+visRows+'</tbody></table>'
: '<div class="empty">aucun visiteur dans la fenêtre</div>';
}
// #659 — per-visitor drill-down: fetch /admin/ad-stats/client/{mac_hash} and
// render that visitor's top blocked ad hosts into #ads-client-detail.
// `mh` is the visitor's mac_hash as shown in the "Top visiteurs" table.
async function loadAdsClient(mh) {
  // HTML-escape any server-supplied text before it reaches innerHTML. Quotes
  // are escaped too, so the helper stays safe in attribute values.
  const esc = s => String(s)
    .replace(/&/g,'&amp;').replace(/</g,'&lt;').replace(/>/g,'&gt;')
    .replace(/"/g,'&quot;').replace(/'/g,'&#39;');
  // Counters are coerced to numbers so a non-numeric payload cannot inject markup.
  const num = v => Number(v) || 0;
  const detail = document.getElementById('ads-client-detail');
  detail.innerHTML = '<div class="empty">loading…</div>';
  // NOTE(review): J is defined elsewhere in the page; it appears to return the
  // parsed JSON body, or an object carrying `__error` on failure — confirm.
  const d = await J('/admin/ad-stats/client/'+encodeURIComponent(mh));
  if (!d || d.__error) {
    // The error text is not trusted markup: escape it like every other value.
    detail.innerHTML = `<div class="empty">${esc((d&&d.__error)||'no data')}</div>`;
    return;
  }
  const rows = (d.top_hosts||[]).map(r =>
    `<tr><td><code>${esc(r.host)}</code></td><td>${num(r.hits)}</td><td>${Math.round(num(r.bytes)/1024)}</td></tr>`
  ).join('');
  detail.innerHTML = `<h3>👤 <code>${esc(d.mac_hash)}</code> — ${num(d.total)} pubs bloquées</h3>`
    + (rows
        ? '<table><thead><tr><th>Ad host</th><th>bloqués</th><th>Ko</th></tr></thead><tbody>'+rows+'</tbody></table>'
        : '<div class="empty">aucune pub bloquée pour ce visiteur</div>');
}
async function refreshAll() { async function refreshAll() {