From 5680113281aa58b63b1bdd7445a17e281007df23 Mon Sep 17 00:00:00 2001
From: Max Resnick
Date: Fri, 28 Feb 2025 22:59:23 -0800
Subject: feat: ad domain parser

---
 README.md    |  66 ++++++++++++++++++++++++
 main.go      | 114 ++++++++++++++++++++++++++++++++++++++++++++++
 main_test.go | 161 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 341 insertions(+)
 create mode 100644 README.md
 create mode 100644 main.go
 create mode 100644 main_test.go

diff --git a/README.md b/README.md
new file mode 100644
index 0000000..b06b0c2
--- /dev/null
+++ b/README.md
@@ -0,0 +1,66 @@
+# Unbound Ads Generator
+
+A tool to generate Unbound DNS server configuration for blocking advertising domains.
+
+## Overview
+
+This program fetches lists of advertising domains from multiple sources and generates a configuration file for the Unbound DNS server to block these domains. It supports multiple input formats and automatically deduplicates domains.
+
+## Usage
+
+```bash
+unbound-ads-generator <url-list> <output-file>
+```
+
+Where:
+- `url-list` is a URL pointing to a text file containing a list of URLs (one per line) that provide domain lists
+- `output-file` is the path where the Unbound configuration will be written
+
+### Input Format
+
+The URL list file should contain URLs (one per line) pointing to domain lists. Lines starting with `#` are treated as comments.
+
+Example URL list file:
+```
+# Ad blocking lists
+https://raw.githubusercontent.com/PolishFiltersTeam/KADhosts/master/KADhosts.txt
+https://raw.githubusercontent.com/FadeMind/hosts.extras/master/add.Spam/hosts
+```
+
+The domain lists can be in either of these formats:
+```
+# IP and domain format
+0.0.0.0 advertising.example.com
+
+# Plain domain format
+advertising.example.com
+```
+
+### Output Format
+
+The program generates Unbound configuration in this format:
+```
+local-zone: "advertising.example.com" refuse
+local-zone: "another-ad.example.com" refuse
+```
+
+## Features
+
+- Supports multiple domain list formats
+- Automatically deduplicates domains
+- Case-insensitive domain matching
+- Progress logging
+- Comment handling
+- Basic domain validation
+
+## Building
+
+```bash
+go build
+```
+
+## Testing
+
+```bash
+go test -v
+```
diff --git a/main.go b/main.go
new file mode 100644
index 0000000..5bb387d
--- /dev/null
+++ b/main.go
@@ -0,0 +1,114 @@
+package main
+
+import (
+	"bufio"
+	"fmt"
+	"log/slog"
+	"net/http"
+	"os"
+	"strings"
+)
+
+func main() {
+	if len(os.Args) != 3 {
+		slog.Error("usage: program <url-list> <output-file>")
+		os.Exit(1)
+	}
+
+	urls, err := fetchURLList(os.Args[1])
+	if err != nil {
+		slog.Error("failed to fetch URL list", "error", err)
+		os.Exit(1)
+	}
+
+	f, err := os.Create(os.Args[2])
+	if err != nil {
+		slog.Error("failed to create output file", "error", err)
+		os.Exit(1)
+	}
+	defer f.Close()
+
+	w := bufio.NewWriter(f)
+	defer w.Flush()
+
+	domains := make(map[string]struct{})
+	for i, url := range urls {
+		slog.Info("fetching domains", "url", url, "progress", fmt.Sprintf("%d/%d", i+1, len(urls)))
+		if err := fetchDomainsAndWrite(url, w, domains); err != nil {
+			slog.Warn("failed to process url", "url", url, "error", err)
+			continue
+		}
+	}
+	slog.Info("completed", "total_domains", len(domains))
+}
+
+func fetchURLList(url string) ([]string, error) {
+	resp, err := http.Get(url)
+	if err != nil {
+		return nil, fmt.Errorf("http get failed: %w", err)
+	}
+	defer resp.Body.Close()
+
+	var urls []string
+	scanner := bufio.NewScanner(resp.Body)
+	for scanner.Scan() {
+		line := strings.TrimSpace(scanner.Text())
+		if line == "" || strings.HasPrefix(line, "#") {
+			continue
+		}
+		urls = append(urls, line)
+	}
+	return urls, scanner.Err()
+}
+
+func fetchDomainsAndWrite(url string, w *bufio.Writer, seen map[string]struct{}) error {
+	resp, err := http.Get(url)
+	if err != nil {
+		return fmt.Errorf("http get failed: %w", err)
+	}
+	defer resp.Body.Close()
+
+	var count int
+	scanner := bufio.NewScanner(resp.Body)
+	for scanner.Scan() {
+		line := strings.TrimSpace(scanner.Text())
+		if line == "" || strings.HasPrefix(line, "#") {
+			continue
+		}
+
+		var domain string
+		// Handle "0.0.0.0 domain.com" format
+		if strings.Contains(line, " ") {
+			parts := strings.Fields(line)
+			if len(parts) >= 2 {
+				domain = parts[1]
+			}
+		} else {
+			domain = line
+		}
+
+		// Basic domain validation and normalization
+		domain = strings.ToLower(strings.TrimSpace(domain))
+		if domain == "" || !strings.Contains(domain, ".") || strings.HasPrefix(domain, ".") || strings.HasSuffix(domain, ".") {
+			continue
+		}
+
+		// Skip if we've seen this domain before
+		if _, exists := seen[domain]; exists {
+			continue
+		}
+		seen[domain] = struct{}{}
+		count++
+
+		if _, err := fmt.Fprintf(w, "local-zone: %q refuse\n", domain); err != nil {
+			return fmt.Errorf("failed to write domain: %w", err)
+		}
+	}
+
+	if err := scanner.Err(); err != nil {
+		return fmt.Errorf("scanner error: %w", err)
+	}
+
+	slog.Info("processed url", "url", url, "new_domains", count)
+	return nil
+}
diff --git a/main_test.go b/main_test.go
new file mode 100644
index 0000000..29a8341
--- /dev/null
+++ b/main_test.go
@@ -0,0 +1,161 @@
+package main
+
+import (
+	"bufio"
+	"bytes"
+	"net/http"
+	"net/http/httptest"
+	"testing"
+)
+
+func TestFetchURLList(t *testing.T) {
+	tests := []struct {
+		name    string
+		content string
+		want    []string
+		wantErr bool
+	}{
+		{
+			name: "basic list with comments",
+			content: `# comment
+https://example.com/1
+https://example.com/2
+# another comment
+https://example.com/3`,
+			want: []string{
+				"https://example.com/1",
+				"https://example.com/2",
+				"https://example.com/3",
+			},
+		},
+		{
+			name: "empty lines and whitespace",
+			content: `  https://example.com/1  
+
+https://example.com/2`,
+			want: []string{
+				"https://example.com/1",
+				"https://example.com/2",
+			},
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			ts := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+				w.Write([]byte(tt.content))
+			}))
+			defer ts.Close()
+
+			got, err := fetchURLList(ts.URL)
+			if (err != nil) != tt.wantErr {
+				t.Errorf("fetchURLList() error = %v, wantErr %v", err, tt.wantErr)
+				return
+			}
+			if !stringSliceEqual(got, tt.want) {
+				t.Errorf("fetchURLList() = %v, want %v", got, tt.want)
+			}
+		})
+	}
+}
+
+func TestFetchDomainsAndWrite(t *testing.T) {
+	tests := []struct {
+		name     string
+		content  string
+		want     string
+		wantSeen map[string]struct{}
+	}{
+		{
+			name: "ip and domain format",
+			content: `# comment
+0.0.0.0 domain1.com
+0.0.0.0 domain2.com`,
+			want: `local-zone: "domain1.com" refuse
+local-zone: "domain2.com" refuse
+`,
+			wantSeen: map[string]struct{}{
+				"domain1.com": {},
+				"domain2.com": {},
+			},
+		},
+		{
+			name: "plain domain format",
+			content: `# comment
+domain1.com
+domain2.com`,
+			want: `local-zone: "domain1.com" refuse
+local-zone: "domain2.com" refuse
+`,
+			wantSeen: map[string]struct{}{
+				"domain1.com": {},
+				"domain2.com": {},
+			},
+		},
+		{
+			name: "mixed format with duplicates",
+			content: `domain1.com
+0.0.0.0 domain1.com
+0.0.0.0 DOMAIN1.COM
+domain2.com`,
+			want: `local-zone: "domain1.com" refuse
+local-zone: "domain2.com" refuse
+`,
+			wantSeen: map[string]struct{}{
+				"domain1.com": {},
+				"domain2.com": {},
+			},
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			ts := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+				w.Write([]byte(tt.content))
+			}))
+			defer ts.Close()
+
+			var buf bytes.Buffer
+			w := bufio.NewWriter(&buf)
+			seen := make(map[string]struct{})
+
+			err := fetchDomainsAndWrite(ts.URL, w, seen)
+			if err != nil {
+				t.Fatalf("fetchDomainsAndWrite() error = %v", err)
+			}
+			w.Flush()
+
+			if got := buf.String(); got != tt.want {
+				t.Errorf("fetchDomainsAndWrite() output = %q, want %q", got, tt.want)
+			}
+
+			if !mapEqual(seen, tt.wantSeen) {
+				t.Errorf("fetchDomainsAndWrite() seen = %v, want %v", seen, tt.wantSeen)
+			}
+		})
+	}
+}
+
+func stringSliceEqual(a, b []string) bool {
+	if len(a) != len(b) {
+		return false
+	}
+	for i := range a {
+		if a[i] != b[i] {
+			return false
+		}
+	}
+	return true
+}
+
+func mapEqual(a, b map[string]struct{}) bool {
+	if len(a) != len(b) {
+		return false
+	}
+	for k := range a {
+		if _, ok := b[k]; !ok {
+			return false
+		}
+	}
+	return true
+}
-- 
cgit v1.2.3
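For context on using the output (this is not part of the patch): since the generated file contains bare `local-zone: ... refuse` lines, one common approach is to pull it into the `server:` clause of `unbound.conf` with the `include:` directive. The path below is only an illustrative assumption; check unbound.conf(5) for include placement on your system.

```
# Illustrative unbound.conf excerpt; /etc/unbound/ads.conf is an assumed output path
server:
    include: "/etc/unbound/ads.conf"
```

After regenerating the file, reloading Unbound (for example with `unbound-control reload`) picks up the new local zones.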