Compare commits

..

3 Commits

Author SHA1 Message Date
CyberMind
78ad554ece
Merge pull request #679 from CyberMind-FR/feat/662-adlearn
Some checks are pending
License Headers / check (push) Waiting to run
feat(#662): restore + strengthen ad/tracker auto-learn (live-reload, candidate emit, cross-site promotion)
2026-06-19 10:40:07 +02:00
895356dc00 chore: changelog 0.1.12 — ad auto-learn loop + live-reload (ref #662) 2026-06-19 10:37:49 +02:00
4063ae1a95 feat(toolbox): restore + strengthen ad/tracker auto-learn loop (ref #662)
The #662 Go cutover blocked from STATIC lists but never (1) emitted learning
candidates nor (2) live-reloaded the lists, so new adwares slipped through
forever and even autolearn promotions needed a worker restart. Restore the
full loop: feeders/outsiders -> lock to blocklist -> silence (204) -> smog
(poison) -> statistify, fed by BOTH the ad-path heuristic AND cross-site
cookie reuse (the social graph).

Go (packages/secubox-toolbox-ng):
- policy.go: mtime-based live-reload (Part 1, linchpin). Policy now holds the
  backing file paths + per-file last-mtime; maybeReload() (throttled ~15s)
  re-stats each file and atomically swaps the changed map under an RWMutex.
  Decide/shouldPoison take the read lock; allowedSafe() is the lock-taking
  entry for the candidate feed. Covers learned-trackers + ad-allowlist +
  splice seed/learned + pure-trackers. Promotions/edits now take effect with
  NO worker restart.
- adstats.go: ad-candidate learning feed (Part 2). Ports ad_ghost._AD_PATH
  (RE2) + a (host,site)->hits aggregator (cap 20k), drained into the existing
  ad-event payload's new "candidates" list by the same 10s flusher.
- main.go: maybeRecordAdCandidate() on the allow/mitm branch — 3rd-party
  (registrable(host) != registrable(site)) AND _AD_PATH match, gated behind
  the analysis relay flag, O(1) fire-and-forget.

Python (packages/secubox-toolbox):
- api.py: /__toolbox/ad-event now ingests "candidates" ->
  store.record_ad_candidates(); capped, try/except, never 500s.
- secubox-toolbox-autolearn: new _social_feed() promotes any cross-site
  cookie-reuse tracker (>= SECUBOX_SOCIAL_MIN_SITES distinct src_site in a
  recent window) from social_edges into learned-trackers.txt, reusing the
  _ad_feed allowlist/self guard and merge/de-dup.

Smog: confirmed isTracker() already consults the live-reloaded learned set
(blockedByAd), so a promoted cross-site tracker is poisoned automatically once
the policy reloads it — no new poison code.

TDD: reload_test.go (incl. -race concurrency), adcand_test.go,
test_ad_event_candidates.py, test_autolearn_socialfeed.py. Go build (offline
arm64 + darwin), vet, go test -race all green.
2026-06-19 10:36:02 +02:00
12 changed files with 938 additions and 35 deletions

View File

@ -0,0 +1,147 @@
// SPDX-License-Identifier: LicenseRef-CMSD-1.0
// Copyright (c) 2026 CyberMind — Gérald Kerma <devel@cybermind.fr>
//
// SecuBox-Deb :: toolbox-ng :: ad-candidate learning-feed tests (#662)
//
// The Go cutover blocked from STATIC lists but never emitted LEARNING
// candidates, so a brand-new adware (acotedemoi.com) was never observed → never
// promoted → slipped through forever. These tests prove the engine now ports
// ad_ghost's _AD_PATH heuristic and records a candidate (host,site) for every
// 3rd-party ad-path request on the allow/mitm path — the feed autolearn promotes.
package main
import (
	"encoding/json"
	"path/filepath"
	"testing"
)
// TestAdPathRegex runs adPathRE over a table of paths: ad/track-looking paths
// that must match (including an upper-case one, since the pattern is
// case-insensitive) and ordinary paths that must not.
func TestAdPathRegex(t *testing.T) {
	table := []struct {
		path  string
		match bool
	}{
		{"/ad/1.gif", true},
		{"/ads/x", true},
		{"/adserver/req", true},
		{"/pagead/conversion", true},
		{"/gampad/ads", true},
		{"/doubleclick/x", true},
		{"/beacon", true},
		{"/pixel.gif", true},
		{"/collect", true},
		{"/track", true},
		{"/tracking/p", true},
		{"/telemetry/v2", true},
		{"/metric", true},
		{"/PAGEAD/Upper", true}, // case-insensitive

		{"/", false},
		{"/index.html", false},
		{"/api/users", false},
		{"/static/app.js", false},
		{"/cart", false},
		{"/headline", false},
	}
	for _, row := range table {
		got := adPathRE.MatchString(row.path)
		switch {
		case row.match && !got:
			t.Errorf("adPathRE should MATCH %q", row.path)
		case !row.match && got:
			t.Errorf("adPathRE should NOT match %q", row.path)
		}
	}
}
// newAdCandTestPolicy returns a Policy loaded from throw-away list files: the
// allowlist holds doubleclick.net (so the allowlist-skip branch can be
// exercised) and every other list is empty, i.e. nothing is learned. The test
// fails immediately if LoadPolicy reports an error.
func newAdCandTestPolicy(t *testing.T) *Policy {
	t.Helper()
	opts := PolicyOpts{
		AllowPath:        writeTemp(t, "doubleclick.net\n"),
		LearnedPath:      writeTemp(t, ""),
		SpliceSeedPath:   writeTemp(t, ""),
		SpliceLearnPath:  writeTemp(t, ""),
		PureTrackersPath: writeTemp(t, ""),
		SelfDomains:      []string{"secubox.in"},
	}
	policy, err := LoadPolicy(opts)
	if err == nil {
		return policy
	}
	t.Fatalf("LoadPolicy: %v", err)
	return policy
}
// TestMaybeRecordAdCandidate drives Proxy.maybeRecordAdCandidate through one
// scenario per guard (3rd-party ad-path, 1st-party, non-ad path, missing
// Referer site, allowlisted host) and checks whether exactly one candidate row
// with the expected host/site/hits was buffered, or none at all.
func TestMaybeRecordAdCandidate(t *testing.T) {
	type scenario struct {
		label    string
		host     string // request host
		site     string // referer site (registrable)
		path     string
		recorded bool   // should a candidate be buffered?
		wantHost string // expected Host of the buffered row
	}
	scenarios := []scenario{
		{"3rd-party ad-path → candidate", "metrics.acotedemoi.com", "lemonde.fr", "/collect", true, "metrics.acotedemoi.com"},
		{"3rd-party ad-path /pagead", "ads.foo.io", "news.example", "/pagead/x", true, "ads.foo.io"},
		{"1st-party (same registrable) → no candidate", "static.lemonde.fr", "lemonde.fr", "/ads/x", false, ""},
		{"3rd-party non-ad-path → no candidate", "cdn.acotedemoi.com", "lemonde.fr", "/app.js", false, ""},
		{"no site (no Referer) → no candidate", "metrics.acotedemoi.com", "", "/collect", false, ""},
		{"allowlisted host → no candidate", "ads.doubleclick.net", "lemonde.fr", "/pagead/x", false, ""},
	}

	pol := newAdCandTestPolicy(t)
	for _, sc := range scenarios {
		t.Run(sc.label, func(t *testing.T) {
			// Fresh feed per scenario so rows never leak between subtests.
			feed := newAdCandidates()
			proxy := &Proxy{pol: pol, cand: feed, analysisRelay: true}
			proxy.maybeRecordAdCandidate(sc.host, sc.site, sc.path)

			rows := feed.snapshot()
			if !sc.recorded {
				if len(rows) != 0 {
					t.Fatalf("want 0 candidates, got %d (%+v)", len(rows), rows)
				}
				return
			}
			if len(rows) != 1 {
				t.Fatalf("want 1 candidate, got %d (%+v)", len(rows), rows)
			}
			row := rows[0]
			if row.Host != sc.wantHost {
				t.Fatalf("candidate host = %q, want %q", row.Host, sc.wantHost)
			}
			if row.Site != sc.site {
				t.Fatalf("candidate site = %q, want %q", row.Site, sc.site)
			}
			if row.Hits != 1 {
				t.Fatalf("candidate hits = %d, want 1", row.Hits)
			}
		})
	}
}
// TestAdCandidateGatedByRelay checks the relay gate: a Proxy built with
// analysisRelay=false must buffer nothing, even for a request that would
// otherwise be a textbook candidate (3rd-party host, ad-looking path).
func TestAdCandidateGatedByRelay(t *testing.T) {
	pol := newAdCandTestPolicy(t)
	feed := newAdCandidates()
	proxy := &Proxy{pol: pol, cand: feed, analysisRelay: false}

	proxy.maybeRecordAdCandidate("metrics.acotedemoi.com", "lemonde.fr", "/collect")

	buffered := len(feed.snapshot())
	if buffered == 0 {
		return
	}
	t.Fatalf("relay off: want 0 candidates, got %d", buffered)
}
// TestAdCandidateHitsAccumulate checks that recording the same (host, site)
// pair several times yields a single row whose Hits equals the number of
// calls, and that snapshot() leaves the buffer empty afterwards.
func TestAdCandidateHitsAccumulate(t *testing.T) {
	feed := newAdCandidates()
	for remaining := 5; remaining > 0; remaining-- {
		feed.record("x.tracker.io", "site.example")
	}

	first := feed.snapshot()
	if !(len(first) == 1 && first[0].Hits == 5) {
		t.Fatalf("want 1 row hits=5, got %+v", first)
	}

	// A second snapshot must come back empty: the first one drained the buffer.
	if leftover := len(feed.snapshot()); leftover != 0 {
		t.Fatalf("snapshot should clear: got %d", leftover)
	}
}
// TestAdCandidatePayloadShape proves the candidates list serialises into the
// extended ad-event payload: a payload carrying only candidates is not
// empty(), and its JSON form has a "candidates" array whose rows use the
// host/site/hits keys with the recorded values.
func TestAdCandidatePayloadShape(t *testing.T) {
	cand := newAdCandidates()
	cand.record("x.tracker.io", "site.example")
	p := adEventPayload{Candidates: cand.snapshot()}
	if p.empty() {
		t.Fatal("payload with candidates must not be empty()")
	}

	// Round-trip through JSON so the struct tags (the wire contract) are what
	// is actually asserted, not just the Go field names.
	raw, err := json.Marshal(p)
	if err != nil {
		t.Fatalf("marshal payload: %v", err)
	}
	var wire struct {
		Candidates []map[string]json.RawMessage `json:"candidates"`
	}
	if err := json.Unmarshal(raw, &wire); err != nil {
		t.Fatalf("unmarshal payload %s: %v", raw, err)
	}
	if len(wire.Candidates) != 1 {
		t.Fatalf("want 1 candidate on the wire, got %d (%s)", len(wire.Candidates), raw)
	}

	row := wire.Candidates[0]
	want := map[string]string{
		"host": `"x.tracker.io"`,
		"site": `"site.example"`,
		"hits": "1",
	}
	for key, wantVal := range want {
		got, ok := row[key]
		if !ok {
			t.Errorf("candidate row is missing key %q (%s)", key, raw)
			continue
		}
		if string(got) != wantVal {
			t.Errorf("candidate row %q = %s, want %s", key, got, wantVal)
		}
	}
}
// writeTemp creates "list.txt" inside a brand-new per-test temp directory,
// fills it with content via writeFile, and returns the file's path.
func writeTemp(t *testing.T, content string) string {
	t.Helper()
	dir := t.TempDir()
	path := filepath.Join(dir, "list.txt")
	writeFile(t, path, content)
	return path
}

View File

@ -26,10 +26,74 @@ import (
"log"
"net/http"
"net/url"
"regexp"
"sync"
"time"
)
// ── ad-candidate learning feed (#662 auto-learn loop) ─────────────────────────
//
// The STATIC block list never grows on its own; ad_ghost fed autolearn by
// capturing CANDIDATES — 3rd-party requests whose PATH smells like an ad/track
// endpoint — into ad_candidates, which secubox-toolbox-autolearn later promotes
// into learned-trackers.txt at AD_MIN_SITES distinct sites. The Go cutover
// dropped this feed, so new adwares (acotedemoi.com) were never observed. This
// restores it in the engine: the allow/mitm hot path records (host,site) when
// the request is 3rd-party AND adPathRE matches, buffered + flushed with the
// existing ad-event machinery.
// adPathRE ports ad_ghost._AD_PATH (RE2-safe, case-insensitive). Matches a path
// that looks like an ad/track endpoint. Learning only — never a block decision.
//
// The pattern is unanchored: it matches anywhere in the path and does not
// require a segment boundary after most alternatives, so "/metric" also
// matches "/metrics" and "/track" also matches "/tracker". Only the "/ad" and
// "/ads" alternative requires a trailing slash.
//
// Python: re.compile(r"/ads?/|/adserver|/pagead|/gampad|/doubleclick|/beacon|"
// r"/pixel|/collect|/track(ing)?|/telemetry|/metric", re.I)
var adPathRE = regexp.MustCompile(`(?i)/ads?/|/adserver|/pagead|/gampad|/doubleclick|/beacon|/pixel|/collect|/track(ing)?|/telemetry|/metric`)
// adCandMapCap bounds the candidate buffer (mirrors ad_ghost's `len(_cand) <
// 20000` guard): NEW keys past the cap are dropped until the next flush clears
// it, so a dead portal can never grow memory unbounded. Keys already in the
// buffer keep counting hits even when the cap has been reached.
const adCandMapCap = 20000
// adCandidates is the lock-guarded (host,site)→hits candidate aggregator,
// drained by the ad-stats flusher into the ad-event payload's "candidates" list.
type adCandidates struct {
	// mu serialises every access to hit (record and snapshot both take it).
	mu sync.Mutex
	// hit counts observations per (adHost, site) key since the last snapshot.
	hit map[adKey]int64
}
// newAdCandidates returns an empty aggregator whose hit map is already
// allocated, so record can write to it straight away.
func newAdCandidates() *adCandidates {
	feed := new(adCandidates)
	feed.hit = make(map[adKey]int64)
	return feed
}
// record counts one observation of the (host, site) candidate pair. A pair
// that is already buffered always has its counter incremented; a pair seen for
// the first time is only added while the buffer holds fewer than adCandMapCap
// keys, otherwise it is silently dropped. An empty host is ignored.
func (a *adCandidates) record(host, site string) {
	if host == "" {
		return
	}
	key := adKey{adHost: host, site: site}

	a.mu.Lock()
	defer a.mu.Unlock()

	count, seen := a.hit[key]
	switch {
	case seen:
		a.hit[key] = count + 1
	case len(a.hit) < adCandMapCap:
		a.hit[key] = 1
	}
}
// snapshot drains the buffer under the lock: it converts every buffered
// (host, site)→hits entry into an adCandidateRow, replaces the map with a fresh
// empty one, and returns the rows. An empty buffer yields nil and is left
// untouched. Row order follows map iteration and is therefore unspecified.
func (a *adCandidates) snapshot() []adCandidateRow {
	a.mu.Lock()
	defer a.mu.Unlock()

	pending := len(a.hit)
	if pending == 0 {
		return nil
	}

	rows := make([]adCandidateRow, pending)
	next := 0
	for key, hits := range a.hit {
		rows[next] = adCandidateRow{Host: key.adHost, Site: key.site, Hits: hits}
		next++
	}
	a.hit = make(map[adKey]int64)
	return rows
}
// refererSite ports the ad_ghost _site_of logic: parse the Referer header as a
// URL, take its hostname, and return registrable(hostname). Empty Referer or a
// parse failure → "" (the page that issued the blocked request is unknown).
@ -133,9 +197,19 @@ type adClientRow struct {
Bytes int64 `json:"bytes"`
}
// adCandidateRow is one learning candidate (host seen issuing ad-path requests
// from a 1st-party site). Mirrors the portal /__toolbox/ad-event "candidates"
// contract → store.record_ad_candidates([(host, site, hits), ...]).
type adCandidateRow struct {
	// Host is the candidate (request) host, serialised as "host".
	Host string `json:"host"`
	// Site is the site the request was attributed to, serialised as "site".
	Site string `json:"site"`
	// Hits is how many times the (Host, Site) pair was recorded, serialised as "hits".
	Hits int64 `json:"hits"`
}
// adEventPayload is the JSON body of one ad-event POST: the block rows, the
// per-client rows and the learning candidates.
type adEventPayload struct {
	// Blocks is serialised as "blocks" (always present in the JSON).
	Blocks []adBlockRow `json:"blocks"`
	// Clients is serialised as "clients" (always present in the JSON).
	Clients []adClientRow `json:"clients"`
	// Candidates is serialised as "candidates"; omitempty drops the key
	// entirely when there are no candidate rows.
	Candidates []adCandidateRow `json:"candidates,omitempty"`
}
// snapshot atomically reads-and-clears both maps, returning the accumulated rows.
@ -159,7 +233,9 @@ func (a *adStats) snapshot() adEventPayload {
}
// empty reports whether a payload carries no rows (nothing to POST).
func (p adEventPayload) empty() bool { return len(p.Blocks) == 0 && len(p.Clients) == 0 }
func (p adEventPayload) empty() bool {
return len(p.Blocks) == 0 && len(p.Clients) == 0 && len(p.Candidates) == 0
}
// adEventClient is a short-timeout fire-and-forget client for the ad-event POST.
// Sibling of portalClient (banner.go): the portal is a fixed loopback base, so
@ -175,8 +251,15 @@ var adEventClient = &http.Client{
// non-2xx) is swallowed with at most a debug log — the metrics are stats, not
// security, and the engine must never block on the portal. Exposed (returns the
// flushed payload) so the test can assert the snapshot/clear + payload shape.
func (a *adStats) flushOnce(portal string) adEventPayload {
//
// cand may be nil (the CONNECT PoC / tests with no learning feed); when set its
// candidate rows are drained into the SAME payload so the learning feed rides
// the existing ad-event channel (one POST per 10s, not two).
func (a *adStats) flushOnce(portal string, cand *adCandidates) adEventPayload {
p := a.snapshot()
if cand != nil {
p.Candidates = cand.snapshot()
}
if p.empty() {
return p
}
@ -198,10 +281,10 @@ func (a *adStats) flushOnce(portal string) adEventPayload {
// runAdStatsFlusher is the background flusher goroutine: every adFlushInterval it
// drains the aggregator to the portal. Start it once from main() (like the
// engine's other startup goroutines). It runs forever (the process lifetime).
func (a *adStats) runAdStatsFlusher(portal string) {
func (a *adStats) runAdStatsFlusher(portal string, cand *adCandidates) {
t := time.NewTicker(adFlushInterval)
defer t.Stop()
for range t.C {
a.flushOnce(portal)
a.flushOnce(portal, cand)
}
}

View File

@ -111,7 +111,7 @@ func TestFlushOncePayloadShapeMatchesContract(t *testing.T) {
}))
defer srv.Close()
a.flushOnce(srv.URL)
a.flushOnce(srv.URL, nil)
if ct != "application/json" {
t.Fatalf("Content-Type = %q, want application/json", ct)
@ -145,7 +145,7 @@ func TestFlushOnceEmptySkipsPost(t *testing.T) {
w.WriteHeader(http.StatusNoContent)
}))
defer srv.Close()
a.flushOnce(srv.URL)
a.flushOnce(srv.URL, nil)
if posted {
t.Fatalf("flushOnce on empty aggregator must not POST")
}
@ -156,7 +156,7 @@ func TestFlushOnceSwallowsPortalError(t *testing.T) {
a.recordAdBlock("ads.example.com", "site", "")
// Unreachable portal → must not panic, must still clear the maps (snapshot
// happens before the POST).
a.flushOnce("http://127.0.0.1:1")
a.flushOnce("http://127.0.0.1:1", nil)
if len(a.blocks) != 0 {
t.Fatalf("flushOnce must clear maps even on POST failure")
}

View File

@ -204,6 +204,7 @@ type Proxy struct {
poison bool // master gate: poison tracker Set-Cookies (default on when jarKey present)
portal string // portal base URL for /__toolbox/* reverse-proxy (banner assets)
ads *adStats // #662 — ad-block metrics aggregator (flushed to the portal)
cand *adCandidates // #662 — ad-candidate learning feed (flushed with ads to the portal)
cspDemo bool // #662 CONSENTED-DEMONSTRATION: relax a page's CSP so the injected loader runs, and flag the bypass (data-csp=1 → 🔓). Default on.
// analysisRelay gates the per-flow telemetry relay to the dpi/cookies/ja4
@ -229,6 +230,33 @@ func (px *Proxy) recordAdBlock(adHost, site, macHash string) {
}
}
// maybeRecordAdCandidate is the engine side of the auto-learn feed (#662). It
// buffers the pair (host, site) in px.cand when ALL of the following hold,
// checked in this order:
//
//   - the proxy, its candidate feed and its policy are set and the relay gate
//     (px.relayEnabled) is on;
//   - both host and site are non-empty (an empty site means there is no
//     1st-party context to attribute the request to);
//   - the policy does not consider host allowed (own-infra / allowlist);
//   - host and site have different registrable domains (3rd-party request);
//   - path matches adPathRE (looks like an ad/track endpoint).
//
// Anything else is a silent no-op, so the method is safe to call on a nil
// receiver or on a Proxy built without a feed.
func (px *Proxy) maybeRecordAdCandidate(host, site, path string) {
	switch {
	case px == nil || px.cand == nil || !px.relayEnabled() || px.pol == nil:
		// Feed not wired up, or the relay gate is off.
	case site == "" || host == "":
		// No 1st-party context (no Referer) → nothing to attribute.
	case px.pol.allowedSafe(host):
		// Own-infra / allowlist: never learn our own / trusted hosts.
	case registrable(host) == registrable(site):
		// 1st-party request: not a cross-site ad/track signal.
	case !adPathRE.MatchString(path):
		// Path doesn't look like an ad/track endpoint.
	default:
		px.cand.record(host, site)
	}
}
// serverTLSConfig returns the TLS config produced by serverTLSConfigCapture
// when it is called with a nil argument.
func (px *Proxy) serverTLSConfig() *tls.Config {
	cfg := px.serverTLSConfigCapture(nil)
	return cfg
}
@ -414,6 +442,15 @@ func (px *Proxy) mitmPipeline(tconn *tls.Conn, rawClient net.Conn, host, verdict
relayIP := peerIP(rawClient)
px.emitDPI(relayIP, clientHash, host, req)
// #662 — feed the auto-learn loop: on this allow/mitm flow, record an
// ad-candidate when the request is 3rd-party AND its path smells like an
// ad/track endpoint (ad_ghost's _AD_PATH heuristic). site = registrable of
// the Referer (the ad_ghost _site_of flavour). Done BEFORE anonymize mutates
// headers (so the Referer is the client's original). O(1), gated,
// fire-and-forget — a new adware host gets observed here, promoted by
// autolearn, then blocked+smogged after the policy live-reloads it.
px.maybeRecordAdCandidate(host, refererSite(req.Header.Get("Referer")), req.URL.Path)
anonymizeRequest(req.Header)
// #662 — do NOT touch Accept-Encoding. We FORWARD the client's original
@ -569,6 +606,7 @@ func main() {
poison: *poison,
portal: *portal,
ads: newAdStats(),
cand: newAdCandidates(),
cspDemo: *cspDemo,
analysisRelay: *analysisRelay,
@ -585,7 +623,9 @@ func main() {
// #662 — start the ad-block metrics flusher: the block path tallies every
// 204 into px.ads, drained every 10s to the portal's /__toolbox/ad-event
// (best-effort, fire-and-forget) so the #ads dashboard sees blocks again.
go px.ads.runAdStatsFlusher(*portal)
// #662 — the candidate feed (px.cand) is drained in the SAME flush so the
// learning candidates ride the existing ad-event channel (one POST / 10s).
go px.ads.runAdStatsFlusher(*portal, px.cand)
if *transparent {
// Transparent R3 mode: raw accept loop, each conn carries its pre-DNAT
// destination via SO_ORIGINAL_DST (recovered in handleTransparent). The

View File

@ -17,6 +17,8 @@ import (
"os"
"regexp"
"strings"
"sync"
"time"
)
// ── ad_ghost: static ad/tracker host pattern (port of _AD_HOST) ──────────────
@ -95,6 +97,11 @@ func envOr(key, def string) string {
// Policy carries the loaded sets/regex and decides per-host actions. It also
// keeps the legacy PoC fields (Inject) so the existing wiring/tests still work.
type Policy struct {
// mu guards the live-reloadable map fields below. Decide/allowed/blockedByAd/
// shouldSplice take RLock; maybeReload takes Lock only when a backing file
// actually changed (the throttle + stat happen under a separate lighter lock).
mu sync.RWMutex
adHost *regexp.Regexp
learned map[string]bool // learned-trackers (host or registrable, lowercased)
allow map[string]bool // ad-allowlist (host or registrable, lowercased)
@ -104,10 +111,41 @@ type Policy struct {
selfRegs map[string]bool // own-infra registrable domains
selfDomains []string // own-infra (for the host==d || host endswith .d guard)
// ── live-reload state (#662 auto-learn loop) ─────────────────────────────
//
// The lists are loaded once at startup, then re-read on-disk when their
// mtime changes so autolearn promotions / manual edits take effect WITHOUT a
// worker restart (mirrors ad_ghost._maybe_reload). The hot path (Decide)
// calls maybeReload(): a throttle check, then — at most every reloadThrottle —
// a cheap stat() of each backing file. Only a changed file is re-read and its
// map atomically swapped under mu.
reloadFiles []reloadTarget // backing files + their swap target
fortknoxSites []string // kept for rebuilding the never-set on pure-trackers reload
reloadMu sync.Mutex // guards lastReloadCheck + the per-file mtimes
lastReloadID int64 // unix-nano of the last throttle pass (0 = never)
reloadThrottle time.Duration // min interval between stat passes (0 in tests = eager)
// Legacy PoC fields kept so non-policy behaviour is unchanged.
Inject []byte // banner / ad-CSS marker injected before </head> or </body>
}
// reloadTarget describes one backing file the engine live-reloads: its path, the
// last mtime we read, whether comment-stripping applies (loadLines vs
// loadLinesRaw), and an applier that swaps the freshly-read set into the right
// Policy field (under p.mu, held by the caller). The pure-trackers applier
// re-derives the never-set (pure-trackers plus the fortknox sites) so it stays
// consistent.
type reloadTarget struct {
	// path is the backing file; an empty path is skipped by maybeReload.
	path string
	// stripComm is passed to scanLines to select comment-stripping.
	stripComm bool
	// lastMtime is the mtime (unix-nano, 0 = missing/unreadable) seen at the
	// last stat; a different value on the next stat triggers a re-read.
	lastMtime int64
	// apply installs the freshly-read set into the Policy.
	apply func(p *Policy, set map[string]bool)
}
// defaultReloadThrottle is the production stat cadence: a backing-file change
// (autolearn runs hourly; a promotion is rare) is observed within ~15s, and the
// hot path stats at most ~4×/minute regardless of request rate. LoadPolicy
// installs it as Policy.reloadThrottle; tests overwrite that field (0 = stat on
// every call).
const defaultReloadThrottle = 15 * time.Second
// loadLines mirrors the comment-stripping Python loaders (splice._load_lines,
// ad_ghost._allowed's allowlist read): split on first '#', trim, lowercase,
// skip blanks. Missing/unreadable file → empty set (best-effort).
@ -196,7 +234,7 @@ func LoadPolicy(opts PolicyOpts) (*Policy, error) {
selfDomains = append(selfDomains, d)
}
return &Policy{
p := &Policy{
adHost: re,
learned: loadLinesRaw(opts.LearnedPath), // mirrors _learned_set (no comment-strip)
allow: loadLines(opts.AllowPath),
@ -205,7 +243,98 @@ func LoadPolicy(opts PolicyOpts) (*Policy, error) {
never: never,
selfRegs: selfRegs,
selfDomains: selfDomains,
}, nil
fortknoxSites: append([]string(nil), opts.FortknoxSites...),
reloadThrottle: defaultReloadThrottle,
}
// ── register the live-reloadable backing files (#662 auto-learn loop) ─────
//
// Each entry re-reads its file when its mtime changes and atomically swaps
// the map under p.mu (held by maybeReload). learned-trackers + ad-allowlist
// are the load-bearing pair (autolearn promotes into learned; the operator
// edits the allowlist); the splice seed/learned + pure-trackers files are
// reloaded too for consistency (pure-trackers re-derives the never-set).
p.reloadFiles = []reloadTarget{
{path: opts.LearnedPath, stripComm: false, lastMtime: statMtime(opts.LearnedPath),
apply: func(p *Policy, s map[string]bool) { p.learned = s }},
{path: opts.AllowPath, stripComm: true, lastMtime: statMtime(opts.AllowPath),
apply: func(p *Policy, s map[string]bool) { p.allow = s }},
{path: opts.SpliceSeedPath, stripComm: true, lastMtime: statMtime(opts.SpliceSeedPath),
apply: func(p *Policy, s map[string]bool) { p.spliceSeed = s }},
{path: opts.SpliceLearnPath, stripComm: true, lastMtime: statMtime(opts.SpliceLearnPath),
apply: func(p *Policy, s map[string]bool) { p.spliceLearn = s }},
{path: opts.PureTrackersPath, stripComm: true, lastMtime: statMtime(opts.PureTrackersPath),
apply: func(p *Policy, s map[string]bool) {
// pure-trackers + fortknox sites → never-set (mirrors LoadPolicy above).
for _, fk := range p.fortknoxSites {
if fk = strings.Trim(strings.ToLower(strings.TrimSpace(fk)), "."); fk != "" {
s[fk] = true
}
}
p.never = s
}},
}
return p, nil
}
// statMtime reports the modification time of path as unix nanoseconds. It
// returns 0 for an empty path and for any path os.Stat cannot read (missing,
// permission denied, ...), so a file that appears or disappears shows up as an
// mtime change to the caller.
func statMtime(path string) int64 {
	if path != "" {
		if info, err := os.Stat(path); err == nil {
			return info.ModTime().UnixNano()
		}
	}
	return 0
}
// maybeReload re-reads any backing list whose on-disk mtime changed since the
// last pass and swaps the affected map(s) into the Policy under p.mu.
//
// Throttling: at most one stat pass runs per p.reloadThrottle (0 disables the
// throttle, which the tests use). The throttle clock is stored as wall-clock
// unix-nanos, so a wall clock that was stepped BACKWARDS yields a negative
// elapsed time; that is treated as "window expired" rather than suppressing
// reloads until the clock catches up again.
//
// Concurrency: the whole pass — throttle check, stat, re-read AND the map
// swap — runs under reloadMu, so passes are fully serialised and a slower,
// older pass can never overwrite the result of a newer one with a stale set
// (which would otherwise stick, because lastMtime already records the newer
// mtime). The swap itself additionally takes p.mu's write lock so readers
// holding p.mu.RLock see either the old or the new map, never a mix.
//
// Lock order is reloadMu → mu. Callers must therefore NOT hold p.mu (read or
// write) when calling maybeReload; Decide and allowedSafe call it before they
// take the read lock.
func (p *Policy) maybeReload() {
	now := time.Now()

	p.reloadMu.Lock()
	defer p.reloadMu.Unlock()

	if p.reloadThrottle > 0 && p.lastReloadID != 0 {
		elapsed := now.Sub(time.Unix(0, p.lastReloadID))
		// elapsed < 0 means the wall clock jumped back: fall through and
		// re-arm the throttle from the current time.
		if elapsed >= 0 && elapsed < p.reloadThrottle {
			return
		}
	}
	p.lastReloadID = now.UnixNano()

	// Collect the files that changed. The re-read happens here, outside p.mu,
	// so readers are not blocked on file I/O.
	type pending struct {
		idx int
		set map[string]bool
	}
	var changed []pending
	for i := range p.reloadFiles {
		rt := &p.reloadFiles[i]
		if rt.path == "" {
			continue
		}
		m := statMtime(rt.path)
		if m == rt.lastMtime {
			continue
		}
		rt.lastMtime = m
		changed = append(changed, pending{idx: i, set: scanLines(rt.path, rt.stripComm)})
	}
	if len(changed) == 0 {
		return
	}

	// Swap the affected maps atomically under the write lock, still holding
	// reloadMu so the order of swaps matches the order of reads.
	p.mu.Lock()
	for _, c := range changed {
		p.reloadFiles[c.idx].apply(p, c.set)
	}
	p.mu.Unlock()
}
// ── registrable: port of ad_ghost._registrable ───────────────────────────────
@ -279,6 +408,11 @@ func hostMatches(host string, patterns map[string]bool) bool {
// allowed: port of ad_ghost._allowed. Own-infra ALWAYS wins (reflash-safe),
// then the operator allowlist (host or registrable).
//
// LOCK CONTRACT: reads the reloadable allow map — the caller MUST hold at least
// p.mu.RLock (Decide / shouldPoison do). Lock-free internally so Decide can call
// it alongside shouldSplice/blockedByAd under a single RLock (sync.RWMutex is
// not reentrant).
func (p *Policy) allowed(host string) bool {
h := strings.ToLower(host)
reg := registrable(h)
@ -297,7 +431,19 @@ func (p *Policy) allowed(host string) bool {
return p.allow[h] || p.allow[reg]
}
// allowedSafe answers allowed(host) for callers that do not already hold
// p.mu (e.g. the ad-candidate feed). It first gives maybeReload a chance to
// pick up a changed allowlist, then evaluates allowed under the read lock, so a
// freshly-allowlisted host stops being learned.
func (p *Policy) allowedSafe(host string) bool {
	p.maybeReload()

	p.mu.RLock()
	defer p.mu.RUnlock()

	verdict := p.allowed(host)
	return verdict
}
// shouldSplice: port of splice.should_splice (never wins; then seed learned).
// LOCK CONTRACT: reads the reloadable never/spliceSeed/spliceLearn maps — the
// caller MUST hold at least p.mu.RLock (Decide does).
func (p *Policy) shouldSplice(sni string) bool {
s := strings.Trim(strings.ToLower(sni), ".")
if s == "" {
@ -312,6 +458,10 @@ func (p *Policy) shouldSplice(sni string) bool {
// blockedByAd: port of the ad_ghost requestheaders block decision (sans the
// allowlist guard, which Decide applies first): _AD_HOST match OR
// registrable/host in learned-trackers.
//
// LOCK CONTRACT: reads the reloadable learned map — the caller MUST hold at
// least p.mu.RLock. Decide and shouldPoison (via isTracker) do; the candidate-
// emit path calls it only through those.
func (p *Policy) blockedByAd(host string) bool {
if p.adHost.MatchString(host) {
return true
@ -339,9 +489,16 @@ func (p *Policy) blockedByAd(host string) bool {
// sni defaults to host when empty (the live engine splices on SNI == the TLS
// host; for the parity harness host and sni are the same value).
func (p *Policy) Decide(host, sni string) string {
// #662 — pick up autolearn promotions / manual edits without a worker
// restart. Throttled to ~every reloadThrottle and best-effort, so the hot
// path normally pays only a time compare. Done BEFORE taking the read lock
// (maybeReload may take the write lock to swap a changed map).
p.maybeReload()
if sni == "" {
sni = host
}
p.mu.RLock()
defer p.mu.RUnlock()
if p.allowed(host) {
return "allow"
}

View File

@ -148,6 +148,12 @@ func (p *Policy) isTracker(host string) bool {
// allowlisted — own-infra flows are left clean (same dark safety as the block
// path). The caller additionally requires a loaded jar key.
func (p *Policy) shouldPoison(host string) bool {
// #662 — consult the same live-reloaded learned set Decide uses, so a host
// promoted into learned-trackers (by autolearn) is poisoned (smogged), not
// only 204'd, without a worker restart. RLock-guard the reloadable maps
// (allowed + isTracker→blockedByAd read them); maybeReload may swap them.
p.mu.RLock()
defer p.mu.RUnlock()
if p.allowed(host) {
return false // own-infra / allowlist → never poison
}

View File

@ -0,0 +1,189 @@
// SPDX-License-Identifier: LicenseRef-CMSD-1.0
// Copyright (c) 2026 CyberMind — Gérald Kerma <devel@cybermind.fr>
//
// SecuBox-Deb :: toolbox-ng :: policy live-reload tests (#662 auto-learn loop)
//
// The #662 Go cutover loaded the BLOCK/SPLICE lists ONCE at startup, so an
// autolearn promotion (or a manual edit) of learned-trackers.txt never took
// effect until a worker restart — the very thing that made new adwares slip
// through forever. These tests prove the mtime-based live-reload: after the
// throttle window, a host appended to learned-trackers.txt flips Decide from
// "mitm" to "block" with NO restart. Concurrency is exercised under -race.
package main
import (
"os"
"path/filepath"
"sync"
"sync/atomic"
"testing"
"time"
)
// writeFile creates or overwrites path with content (mode 0644) and aborts the
// calling test if the write fails.
func writeFile(t *testing.T, path, content string) {
	t.Helper()
	err := os.WriteFile(path, []byte(content), 0o644)
	if err == nil {
		return
	}
	t.Fatalf("write %s: %v", path, err)
}
// bumpMtime sets both the access and modification time of path to "now + d",
// so a following stat is guaranteed to observe a different mtime even on
// coarse-granularity filesystems or in sub-second test runs. The calling test
// is aborted if the timestamps cannot be changed.
func bumpMtime(t *testing.T, path string, d time.Duration) {
	t.Helper()
	stamp := time.Now().Add(d)
	err := os.Chtimes(path, stamp, stamp)
	if err == nil {
		return
	}
	t.Fatalf("chtimes %s: %v", path, err)
}
// TestMaybeReloadPicksUpAppendedLearnedTracker is the linchpin live-reload
// test. With the throttle disabled, a host that Decide first classifies as
// "mitm" must be classified as "block" as soon as it has been written to
// learned-trackers.txt (and the file's mtime moved forward) — on the SAME
// Policy value, with no reload from scratch.
func TestMaybeReloadPicksUpAppendedLearnedTracker(t *testing.T) {
	const host = "acotedemoi.com"

	dir := t.TempDir()
	learned := filepath.Join(dir, "learned-trackers.txt")
	allow := filepath.Join(dir, "ad-allowlist.txt")
	writeFile(t, learned, "")
	writeFile(t, allow, "")

	opts := PolicyOpts{
		LearnedPath: learned,
		AllowPath:   allow,
		// The splice/never paths point at files that do not exist inside the
		// temp dir, so their missing-file behaviour (empty set) is deterministic.
		SpliceSeedPath:   filepath.Join(dir, "seed"),
		SpliceLearnPath:  filepath.Join(dir, "slearn"),
		PureTrackersPath: filepath.Join(dir, "pure"),
		SelfDomains:      []string{"secubox.in"},
	}
	pol, err := LoadPolicy(opts)
	if err != nil {
		t.Fatalf("LoadPolicy: %v", err)
	}
	// Zero throttle = stat on every Decide, so the test need not wait 15s.
	pol.reloadThrottle = 0

	expect := func(stage, want string) {
		t.Helper()
		if got := pol.Decide(host, host); got != want {
			t.Fatalf("%s: Decide(%q) = %q, want %s", stage, host, got, want)
		}
	}

	expect("before promotion", "mitm")

	// Promote: list the host and push the mtime forward.
	writeFile(t, learned, host+"\n")
	bumpMtime(t, learned, 2*time.Second)

	expect("after promotion", "block")
}
// TestMaybeReloadThrottled checks the throttle. With a one-hour window, one
// Decide call arms the throttle clock; a change written to the learned file
// right afterwards must NOT be observed by the next Decide, which therefore
// still answers "mitm" for the newly listed host.
func TestMaybeReloadThrottled(t *testing.T) {
	const host = "tracker.example"

	dir := t.TempDir()
	learned := filepath.Join(dir, "learned-trackers.txt")
	writeFile(t, learned, "")

	opts := PolicyOpts{LearnedPath: learned, AllowPath: filepath.Join(dir, "allow")}
	pol, err := LoadPolicy(opts)
	if err != nil {
		t.Fatalf("LoadPolicy: %v", err)
	}
	pol.reloadThrottle = time.Hour // effectively "never re-stat during the test"

	// First Decide performs the initial stat pass and arms the throttle clock.
	pol.Decide("x.example", "x.example")

	writeFile(t, learned, host+"\n")
	bumpMtime(t, learned, 2*time.Second)

	verdict := pol.Decide(host, host)
	if verdict == "mitm" {
		return
	}
	t.Fatalf("throttled: Decide(%q) = %q, want mitm (change not yet observed)", host, verdict)
}
// TestMaybeReloadAllowlist checks that the allowlist file is live-reloaded as
// well: "doubleclick.net" is first decided as "block", and flips to "allow"
// once it has been written to the allowlist and the file's mtime has moved
// (throttle disabled).
func TestMaybeReloadAllowlist(t *testing.T) {
	const host = "doubleclick.net"

	dir := t.TempDir()
	learned := filepath.Join(dir, "learned-trackers.txt")
	allow := filepath.Join(dir, "ad-allowlist.txt")
	writeFile(t, learned, "")
	writeFile(t, allow, "")

	opts := PolicyOpts{LearnedPath: learned, AllowPath: allow}
	pol, err := LoadPolicy(opts)
	if err != nil {
		t.Fatalf("LoadPolicy: %v", err)
	}
	pol.reloadThrottle = 0

	before := pol.Decide(host, host)
	if before != "block" {
		t.Fatalf("before allow: Decide(%q) = %q, want block", host, before)
	}

	writeFile(t, allow, host+"\n")
	bumpMtime(t, allow, 2*time.Second)

	after := pol.Decide(host, host)
	if after != "allow" {
		t.Fatalf("after allow: Decide(%q) = %q, want allow", host, after)
	}
}
// TestMaybeReloadConcurrent runs Decide from many goroutines while the backing
// learned file is rewritten concurrently. Under `go test -race` this proves the
// RWMutex-guarded swap is data-race-free.
//
// The writer goroutine does its file I/O inline and reports problems with
// t.Errorf: t.Fatalf (and helpers that call it, such as writeFile/bumpMtime)
// must only be called from the goroutine running the test function.
func TestMaybeReloadConcurrent(t *testing.T) {
	dir := t.TempDir()
	learned := filepath.Join(dir, "learned-trackers.txt")
	writeFile(t, learned, "seed.example\n")
	pol, err := LoadPolicy(PolicyOpts{LearnedPath: learned, AllowPath: filepath.Join(dir, "allow")})
	if err != nil {
		t.Fatalf("LoadPolicy: %v", err)
	}
	pol.reloadThrottle = 0 // force a stat on every Decide → maximal contention

	var wg sync.WaitGroup
	var blocks int64
	stop := make(chan struct{})

	// Writer: keep rewriting the list (seed + one rotating host) and pushing
	// the mtime forward until told to stop.
	wg.Add(1)
	go func() {
		defer wg.Done()
		for i := 0; ; i++ {
			select {
			case <-stop:
				return
			default:
			}
			body := "seed.example\nh" + itoa(i) + ".example\n"
			if err := os.WriteFile(learned, []byte(body), 0o644); err != nil {
				t.Errorf("write %s: %v", learned, err)
				return
			}
			ft := time.Now().Add(time.Duration(i+1) * time.Second)
			if err := os.Chtimes(learned, ft, ft); err != nil {
				t.Errorf("chtimes %s: %v", learned, err)
				return
			}
		}
	}()

	// Readers: hammer Decide on the seed host and on a host that comes and
	// goes, counting how often the seed is blocked.
	for r := 0; r < 8; r++ {
		wg.Add(1)
		go func() {
			defer wg.Done()
			for j := 0; j < 2000; j++ {
				if pol.Decide("seed.example", "seed.example") == "block" {
					atomic.AddInt64(&blocks, 1)
				}
				pol.Decide("h0.example", "h0.example")
			}
		}()
	}

	time.Sleep(50 * time.Millisecond)
	close(stop)
	wg.Wait()

	if atomic.LoadInt64(&blocks) == 0 {
		t.Fatal("expected the stable seed host to block at least once")
	}
}

View File

@ -1,3 +1,12 @@
secubox-toolbox-ng (0.1.12-1~bookworm1) bookworm; urgency=medium
* adlearn: live-reload the blocklist (mtime) so promotions/edits block without
a worker restart; emit ad-candidates (3rd-party ad-path) to the portal;
autolearn also promotes cross-site trackers from social_edges. Learned
trackers are auto-204 + poison-smogged. (ref #662)
-- Gerald KERMA <devel@cybermind.fr> Fri, 19 Jun 2026 12:30:00 +0000
secubox-toolbox-ng (0.1.11-1~bookworm1) bookworm; urgency=medium
* social: ALSO correlate on the block path — blocked 3rd-party trackers still

View File

@ -221,6 +221,92 @@ def _ad_feed() -> int:
return len(promoted)
# #662 — cross-site-reuse promotion. A tracker_domain seen issuing cookies on
# >= SOCIAL_MIN_SITES DISTINCT src_site (across peers, recent window) is a
# BEHAVIOURALLY-confirmed cross-site tracker (the social graph), independent of
# the ad-path heuristic. Promote it into learned-trackers.txt so the engine
# blocks (204) + smogs it. Conservative + reuses the SAME allowlist/self guard as
# _ad_feed (NEVER promote allowlisted or self domains). De-dups against OUT.
#
# Both tunables are read from the environment once, at import time, and parsed
# with int(); a non-integer value therefore raises ValueError while the module
# is loading.
#
# Minimum number of distinct src_site values a tracker must span to be promoted
# (env SECUBOX_SOCIAL_MIN_SITES, default 3).
SOCIAL_MIN_SITES = int(os.environ.get("SECUBOX_SOCIAL_MIN_SITES", "3"))
# Look-back window, in hours, applied to social_edges.ts
# (env SECUBOX_SOCIAL_WINDOW_HOURS, default 168 = 7 days).
SOCIAL_WINDOW_HOURS = int(os.environ.get("SECUBOX_SOCIAL_WINDOW_HOURS", "168"))
def _social_feed() -> int:
    """Promote cross-site cookie-reuse trackers (social_edges) into the learned
    blocklist.

    A registrable domain is promoted when, within the last SOCIAL_WINDOW_HOURS,
    its tracker_domain rows (all subdomains folded together via registrable())
    span at least SOCIAL_MIN_SITES distinct non-empty src_site values. Domains
    on the ad allowlist, and SECUBOX_SELF_DOMAINS entries or their subdomains,
    are never promoted.

    New entries are MERGED into OUT through a temp file + os.replace. OUT is
    left untouched when nothing new qualifies (so its mtime does not change for
    a no-op run) and when its current content cannot be read (so a failed read
    can never shrink the list).

    Returns the number of newly written entries, 0 when nothing new qualifies,
    or -1 when the database query, the registrable fold, or the OUT read/write
    fails; each such failure is logged to stderr.
    """
    cutoff = int(time.time()) - SOCIAL_WINDOW_HOURS * 3600

    # One query yields every distinct (tracker, site) pair in the window; the
    # connection is closed even when execute() raises (e.g. missing table).
    try:
        con = sqlite3.connect(DB, timeout=5)
        try:
            pairs = con.execute(
                "SELECT DISTINCT tracker_domain, src_site "
                "FROM social_edges WHERE ts >= ?", (cutoff,)).fetchall()
        finally:
            con.close()
    except Exception as e:
        sys.stderr.write(f"autolearn: social query failed: {e}\n")
        return -1

    # Fold to registrable and aggregate the distinct sites per eTLD+1 (two
    # tracker subdomains of the same registrable jointly meet the threshold).
    by_reg: dict[str, set] = {}
    reg_of: dict = {}  # tracker_domain -> registrable, computed once per domain
    try:
        for td, site in pairs:
            if td not in reg_of:
                reg_of[td] = registrable(td)
            reg = reg_of[td]
            if not reg:
                continue
            sites = by_reg.setdefault(reg, set())
            if site:
                sites.add(site)
    except Exception as e:
        sys.stderr.write(f"autolearn: social fold failed: {e}\n")
        return -1

    allow = _load_ad_allowlist()
    self_doms = {d.strip().lower() for d in
                 os.environ.get("SECUBOX_SELF_DOMAINS", "secubox.in").split(",")
                 if d.strip()}

    promoted = {
        reg for reg, sites in by_reg.items()
        if len(sites) >= SOCIAL_MIN_SITES
        and reg not in allow
        and not any(reg == d or reg.endswith("." + d) for d in self_doms)
    }
    if not promoted:
        return 0

    existing: set = set()
    try:
        if os.path.exists(OUT):
            with open(OUT, encoding="utf-8") as fh:
                existing = {ln.strip() for ln in fh if ln.strip()}
    except Exception as e:
        # Writing after a failed/partial read would drop entries already in
        # OUT, so give up instead of overwriting.
        sys.stderr.write(f"autolearn: social merge read failed: {e}\n")
        return -1

    new = promoted - existing
    if not new:
        # Everything is already listed: skip the rewrite so OUT's mtime stays
        # put and mtime-based watchers are not woken for nothing.
        return 0

    merged = sorted(existing | promoted)[:MAX_ENTRIES]
    try:
        out_dir = os.path.dirname(OUT)
        if out_dir:  # os.makedirs("") raises when OUT is a bare filename
            os.makedirs(out_dir, exist_ok=True)
        tmp = OUT + ".tmp"
        with open(tmp, "w", encoding="utf-8") as fh:
            fh.write("\n".join(merged) + ("\n" if merged else ""))
        os.replace(tmp, OUT)
    except Exception as e:
        sys.stderr.write(f"autolearn: social write failed: {e}\n")
        return -1

    # Count only the new entries that survived the MAX_ENTRIES cap.
    return len(new.intersection(merged))
def main() -> int:
learned: set[str] = set()
try:
@ -317,6 +403,11 @@ def main() -> int:
sys.stderr.write(f"autolearn: {_n_ad} ad-candidate hosts promoted\n")
except Exception as e:
sys.stderr.write(f"autolearn: ad feed error: {e}\n")
try:
_n_social = _social_feed()
sys.stderr.write(f"autolearn: {_n_social} cross-site cookie trackers promoted\n")
except Exception as e:
sys.stderr.write(f"autolearn: social feed error: {e}\n")
sys.stderr.write(
f"autolearn: {len(out)} hosts learned ({ti} threat-intel + "
f"{len(out) - ti} classified cross-site) @ {int(time.time())}"

View File

@ -113,12 +113,20 @@ async def toolbox_ad_event(request: Request) -> Response:
return Response(status_code=204)
blocks = body.get("blocks") or []
clients = body.get("clients") or []
# #662 — the Go engine now also feeds the AUTO-LEARN loop: 3rd-party
# ad-path requests it saw on the allow/mitm path (ad_ghost's _AD_PATH
# heuristic), recorded as candidates here for secubox-toolbox-autolearn
# to promote into learned-trackers.txt at AD_MIN_SITES distinct sites.
candidates = body.get("candidates") or []
if not isinstance(blocks, list):
blocks = []
if not isinstance(clients, list):
clients = []
if not isinstance(candidates, list):
candidates = []
blocks = blocks[:_AD_EVENT_ROW_CAP]
clients = clients[:_AD_EVENT_ROW_CAP]
candidates = candidates[:_AD_EVENT_ROW_CAP]
block_rows = [
(b["ad_host"], b.get("site", ""), "block", int(b.get("hits", 0)), int(b.get("bytes", 0)))
@ -130,10 +138,17 @@ async def toolbox_ad_event(request: Request) -> Response:
for c in clients
if isinstance(c, dict) and c.get("mac_hash") and c.get("ad_host")
]
cand_rows = [
(c["host"], c.get("site", ""), int(c.get("hits", 0)))
for c in candidates
if isinstance(c, dict) and c.get("host")
]
if block_rows:
store.record_ad_blocks(block_rows)
if client_rows:
store.record_ad_client_blocks(client_rows)
if cand_rows:
store.record_ad_candidates(cand_rows)
except Exception as e: # never raise into the engine's fire-and-forget POST
log.debug("ad-event ingest failed: %s", e)
return Response(status_code=204)

View File

@ -0,0 +1,68 @@
# tests/test_ad_event_candidates.py
# SPDX-License-Identifier: LicenseRef-CMSD-1.0
"""#662 — /__toolbox/ad-event accepts a "candidates" list (the Go engine's
auto-learn feed) and forwards it to store.record_ad_candidates(). Never 500s the engine."""
import asyncio
import json
from secubox_toolbox import api, store
class _FakeRequest:
"""Minimal Request stand-in: headers + an async json() body."""
def __init__(self, body: dict, content_length=None):
self._body = body
cl = content_length
if cl is None:
cl = len(json.dumps(body).encode())
self.headers = {"content-length": str(cl)}
async def json(self):
return self._body
def test_candidates_ingested(monkeypatch):
    """Valid candidates reach store.record_ad_candidates as (host, site, hits)
    tuples; entries with a missing or empty host are dropped."""
    seen = {}

    def _capture(rows):
        return seen.setdefault("cand", list(rows))

    monkeypatch.setattr(store, "record_ad_candidates", _capture)
    for sink in ("record_ad_blocks", "record_ad_client_blocks"):
        monkeypatch.setattr(store, sink, lambda rows: None)

    payload = {
        "blocks": [],
        "clients": [],
        "candidates": [
            {"host": "metrics.acotedemoi.com", "site": "lemonde.fr", "hits": 3},
            {"host": "ads.foo.io", "site": "news.example", "hits": 1},
            {"site": "no-host.example", "hits": 9},  # no host key: skipped
            {"host": "", "site": "x", "hits": 2},  # blank host: skipped
        ],
    }
    response = asyncio.run(api.toolbox_ad_event(_FakeRequest(payload)))

    assert response.status_code == 204
    expected = [
        ("metrics.acotedemoi.com", "lemonde.fr", 3),
        ("ads.foo.io", "news.example", 1),
    ]
    assert seen.get("cand") == expected
def test_candidates_absent_is_noop(monkeypatch):
    """A payload without a "candidates" key still returns 204 and never calls
    store.record_ad_candidates."""
    state = {"cand": False}

    def _mark(rows):
        state["cand"] = True

    monkeypatch.setattr(store, "record_ad_candidates", _mark)
    for sink in ("record_ad_blocks", "record_ad_client_blocks"):
        monkeypatch.setattr(store, sink, lambda rows: None)

    request = _FakeRequest({"blocks": [], "clients": []})
    response = asyncio.run(api.toolbox_ad_event(request))

    assert response.status_code == 204
    assert state["cand"] is False  # candidates key missing, so the store hook stays idle
def test_candidates_bad_payload_never_500s(monkeypatch):
    """If store.record_ad_candidates raises, the endpoint still answers 204."""

    def _explode(rows):
        raise RuntimeError("boom")

    monkeypatch.setattr(store, "record_ad_candidates", _explode)
    for sink in ("record_ad_blocks", "record_ad_client_blocks"):
        monkeypatch.setattr(store, sink, lambda rows: None)

    payload = {"candidates": [{"host": "x.io", "site": "s", "hits": 1}]}
    response = asyncio.run(api.toolbox_ad_event(_FakeRequest(payload)))

    # The store blew up, yet the handler must swallow it and return 204.
    assert response.status_code == 204

View File

@ -0,0 +1,98 @@
# tests/test_autolearn_socialfeed.py
# SPDX-License-Identifier: LicenseRef-CMSD-1.0
"""#662 — cross-site-reuse promotion: a tracker_domain seen on >= N distinct
src_site across recent social_edges is a behaviourally-confirmed cross-site
tracker and gets promoted into learned-trackers.txt. Allowlist + self guard
reused from _ad_feed; merges (never overwrites)."""
import sqlite3
import importlib.util
import pathlib
import time
def _load_autolearn():
    """Execute the extension-less ``sbin/secubox-toolbox-autolearn`` script
    (located one directory above this test's folder) inside a fresh module
    object named "autolearn" and return that module.

    The script is executed on every call, so module-level settings it derives
    from the environment reflect the environment at call time.
    """
    script = pathlib.Path(__file__).resolve().parents[1] / "sbin" / "secubox-toolbox-autolearn"
    spec = importlib.util.spec_from_loader("autolearn", loader=None)
    mod = importlib.util.module_from_spec(spec)
    # Python source defaults to UTF-8; decode it that way explicitly instead of
    # relying on the locale's preferred encoding.
    source = script.read_text(encoding="utf-8")
    exec(compile(source, str(script), "exec"), mod.__dict__)
    return mod
def _mk_db(db):
con = sqlite3.connect(db)
con.executescript(
"CREATE TABLE social_edges("
" id INTEGER PRIMARY KEY AUTOINCREMENT, ts INTEGER NOT NULL,"
" client_mac_hash TEXT, src_site TEXT NOT NULL,"
" tracker_domain TEXT NOT NULL, cookie_id_hash TEXT,"
" ja4_hash TEXT, consent_state TEXT DEFAULT 'none_seen');")
return con
def test_social_feed_promotes_cross_site_tracker(tmp_path, monkeypatch):
    """Only a recent, non-allowlisted, non-self tracker spanning >= 3 distinct
    sites is appended to the learned file; prior content is kept."""
    now = int(time.time())
    stale = now - 999999
    edges = [
        # tracker.io spans 3 distinct src_sites (>= SOCIAL_MIN_SITES=3): promote
        (now, "m1", "cnn.com", "tracker.io"),
        (now, "m1", "bbc.com", "tracker.io"),
        (now, "m2", "lemonde.fr", "tracker.io"),
        # twosite.net spans only 2 distinct sites: not promoted
        (now, "m1", "cnn.com", "twosite.net"),
        (now, "m1", "bbc.com", "twosite.net"),
        # safe.cdn.net spans 3 sites but is allowlisted: excluded
        (now, "m1", "a.com", "safe.cdn.net"),
        (now, "m1", "b.com", "safe.cdn.net"),
        (now, "m1", "c.com", "safe.cdn.net"),
        # secubox.in spans 3 sites but is a self domain: excluded
        (now, "m1", "a.com", "secubox.in"),
        (now, "m1", "b.com", "secubox.in"),
        (now, "m1", "c.com", "secubox.in"),
        # stale.io spans 3 sites but lies outside the recent window: excluded
        (stale, "m1", "a.com", "stale.io"),
        (stale, "m1", "b.com", "stale.io"),
        (stale, "m1", "c.com", "stale.io"),
    ]

    db_path = tmp_path / "t.db"
    handle = _mk_db(db_path)
    handle.executemany(
        "INSERT INTO social_edges(ts,client_mac_hash,src_site,tracker_domain) "
        "VALUES(?,?,?,?)", edges)
    handle.commit()
    handle.close()

    allow_path = tmp_path / "ad-allowlist.txt"
    allow_path.write_text("safe.cdn.net\n")
    learned_path = tmp_path / "learned-trackers.txt"
    learned_path.write_text("preexisting.tracker.com\n")

    # The environment must be in place before the script is loaded.
    environment = {
        "SECUBOX_AUTOLEARN_DB": str(db_path),
        "SECUBOX_AUTOLEARN_OUT": str(learned_path),
        "SECUBOX_AD_ALLOWLIST": str(allow_path),
        "SECUBOX_SOCIAL_MIN_SITES": "3",
        "SECUBOX_SOCIAL_WINDOW_HOURS": "168",
    }
    for key, value in environment.items():
        monkeypatch.setenv(key, value)

    autolearn = _load_autolearn()
    promoted_count = autolearn._social_feed()
    entries = learned_path.read_text().split()

    assert "tracker.io" in entries  # 3 distinct recent sites: promoted
    for absent in ("twosite.net",    # under the threshold
                   "safe.cdn.net",   # allowlisted
                   "secubox.in",     # self domain
                   "stale.io"):      # outside the window
        assert absent not in entries
    assert "preexisting.tracker.com" in entries  # merged, not overwritten
    assert len(entries) == len(set(entries))  # no duplicates
    assert promoted_count == 1
def test_social_feed_no_table_is_safe(tmp_path, monkeypatch):
    """With no social_edges table, _social_feed returns -1 and leaves the
    learned file as it was."""
    db_path = tmp_path / "empty.db"
    sqlite3.connect(db_path).close()  # creates the file, but no social_edges table

    learned_path = tmp_path / "learned-trackers.txt"
    learned_path.write_text("x.tracker.com\n")

    for key, value in (("SECUBOX_AUTOLEARN_DB", str(db_path)),
                       ("SECUBOX_AUTOLEARN_OUT", str(learned_path))):
        monkeypatch.setenv(key, value)

    autolearn = _load_autolearn()
    result = autolearn._social_feed()

    assert result == -1  # reported as unavailable rather than crashing
    assert "x.tracker.com" in learned_path.read_text()  # learned file left alone