aboutsummaryrefslogtreecommitdiff
path: root/src/Scraping
diff options
context:
space:
mode:
authorfrosty <frosty@illegalfirearms.store>2026-01-06 23:46:24 -0500
committerfrosty <frosty@illegalfirearms.store>2026-01-06 23:46:24 -0500
commitf3aa7ca0bc2ef7c286609e8f87d07cc2568093af (patch)
tree269352af1238b4dd7c3e2e023f71a27b858cdb34 /src/Scraping
rebase(d)
Diffstat (limited to 'src/Scraping')
-rw-r--r--src/Scraping/Scraping.c468
-rw-r--r--src/Scraping/Scraping.h34
2 files changed, 502 insertions, 0 deletions
diff --git a/src/Scraping/Scraping.c b/src/Scraping/Scraping.c
new file mode 100644
index 0000000..d2afea6
--- /dev/null
+++ b/src/Scraping/Scraping.c
@@ -0,0 +1,468 @@
+#include "Scraping.h"
+#include <curl/curl.h>
+#include <libxml/HTMLparser.h>
+#include <libxml/xpath.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <time.h>
+#include <unistd.h>
+
/* Growable byte buffer used to accumulate an HTTP response body as
 * libcurl delivers it chunk by chunk (see WriteMemoryCallback). */
typedef struct {
  char *memory; /* heap-allocated bytes; kept NUL-terminated by the writer */
  size_t size;  /* payload length in bytes, excluding the NUL terminator */
} MemoryBuffer;
+
+static size_t WriteMemoryCallback(void *contents, size_t size, size_t nmemb,
+ void *userp) {
+ size_t realsize = size * nmemb;
+ MemoryBuffer *mem = (MemoryBuffer *)userp;
+
+ char *ptr = (char *)realloc(mem->memory, mem->size + realsize + 1);
+ if (ptr == NULL) {
+ LOG_ERROR("Not enough memory (realloc returned NULL)");
+ return 0;
+ }
+
+ mem->memory = ptr;
+ memcpy(&(mem->memory[mem->size]), contents, realsize);
+ mem->size += realsize;
+ mem->memory[mem->size] = 0;
+
+ return realsize;
+}
+
/* Returns a pointer to one of several static, realistic browser User-Agent
 * strings, chosen at random so successive requests look less like a bot.
 * The returned pointer references static storage and must not be freed.
 * NOTE(review): relies on rand(); assumes srand() is seeded elsewhere in
 * the program — TODO confirm, otherwise the rotation repeats per run. */
static const char *get_random_user_agent() {
  static const char *agents[] = {
      "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, "
      "like Gecko) Chrome/120.0.0.0 Safari/537.36",
      "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 "
      "(KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36",
      "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like "
      "Gecko) "
      "Chrome/120.0.0.0 Safari/537.36",
      "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 "
      "Firefox/121.0",
      "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 "
      "(KHTML, like Gecko) Version/17.2 Safari/605.1.15"};
  /* Derive the count from the array itself instead of the original
   * hard-coded 5, so adding or removing an agent cannot desynchronize
   * the modulus and index out of bounds. */
  return agents[rand() % (sizeof(agents) / sizeof(agents[0]))];
}
+
/* Parses a DuckDuckGo Lite results page (table-based layout).
 * Each result link is an <a class="result-link">; its snippet lives in a
 * following <tr> containing a <td class="result-snippet">.
 * On success allocates *out_results (caller frees the array and each
 * url/title/snippet string) and returns the number of results parsed,
 * capped at max_results. Returns 0 on any failure or when nothing matched;
 * *out_results is only written when at least one link matched. */
static int parse_ddg_lite(const char *engine_name, xmlDocPtr doc,
                          SearchResult **out_results, int max_results) {
  LOG_DEBUG("[%s] Starting XPath parsing...", engine_name);
  int found_count = 0;
  xmlXPathContextPtr xpathCtx = xmlXPathNewContext(doc);
  if (!xpathCtx) {
    LOG_ERROR("[%s] Could not create XPath context", engine_name);
    return 0;
  }

  const char *link_xpath = "//a[@class='result-link']";
  xmlXPathObjectPtr xpathObj =
      xmlXPathEvalExpression((xmlChar *)link_xpath, xpathCtx);

  if (!xpathObj || !xpathObj->nodesetval || xpathObj->nodesetval->nodeNr == 0) {
    LOG_WARN("[%s] No results found with XPath: %s", engine_name, link_xpath);
    if (xpathObj) xmlXPathFreeObject(xpathObj);
    xmlXPathFreeContext(xpathCtx);
    return 0;
  }

  int num_links = xpathObj->nodesetval->nodeNr;
  LOG_INFO("[%s] XPath matched %d potential result links", engine_name,
           num_links);

  /* Allocate only as many slots as can actually be filled. */
  int actual_alloc = (num_links < max_results) ? num_links : max_results;
  *out_results = (SearchResult *)calloc(actual_alloc, sizeof(SearchResult));
  if (!*out_results) {
    LOG_ERROR("[%s] Failed to allocate memory for results", engine_name);
    xmlXPathFreeObject(xpathObj);
    xmlXPathFreeContext(xpathCtx);
    return 0;
  }

  for (int i = 0; i < num_links && found_count < max_results; i++) {
    xmlNodePtr linkNode = xpathObj->nodesetval->nodeTab[i];
    char *title = (char *)xmlNodeGetContent(linkNode);
    char *url = (char *)xmlGetProp(linkNode, (xmlChar *)"href");
    char *snippet_text = NULL;

    /* Walk up to the enclosing table row that holds this result link. */
    xmlNodePtr current = linkNode->parent;
    while (current && xmlStrcasecmp(current->name, (const xmlChar *)"tr") != 0)
      current = current->parent;

    /* The snippet is expected in the next <tr> sibling after the link row
     * (skipping non-<tr> nodes such as whitespace text nodes). */
    if (current && current->next) {
      xmlNodePtr snippetRow = current->next;
      while (snippetRow &&
             xmlStrcasecmp(snippetRow->name, (const xmlChar *)"tr") != 0)
        snippetRow = snippetRow->next;
      if (snippetRow) {
        /* Fresh context anchored at the snippet row so a relative XPath
         * only searches inside that row. */
        xmlXPathContextPtr subCtx = xmlXPathNewContext(doc);
        if (subCtx) {
          subCtx->node = snippetRow;
          xmlXPathObjectPtr sObj = xmlXPathEvalExpression(
              (xmlChar *)".//td[@class='result-snippet']", subCtx);
          if (sObj && sObj->nodesetval && sObj->nodesetval->nodeNr > 0) {
            snippet_text =
                (char *)xmlNodeGetContent(sObj->nodesetval->nodeTab[0]);
          }
          if (sObj) xmlXPathFreeObject(sObj);
          xmlXPathFreeContext(subCtx);
        }
      }
    }

    /* strdup into caller-owned strings; libxml buffers are freed below.
     * NOTE(review): a link with a missing href is still emitted with an
     * empty url — verify callers tolerate empty-url results. */
    (*out_results)[found_count].url = strdup(url ? url : "");
    (*out_results)[found_count].title = strdup(title ? title : "No Title");
    (*out_results)[found_count].snippet =
        strdup(snippet_text ? snippet_text : "");

    LOG_DEBUG("[%s] Parsed Result %d: %s", engine_name, found_count + 1,
              (*out_results)[found_count].title);
    found_count++;

    if (title) xmlFree(title);
    if (url) xmlFree(url);
    if (snippet_text) xmlFree(snippet_text);
  }

  xmlXPathFreeObject(xpathObj);
  xmlXPathFreeContext(xpathCtx);
  return found_count;
}
+
+static int parse_startpage(const char *engine_name, xmlDocPtr doc,
+ SearchResult **out_results, int max_results) {
+ LOG_DEBUG("[%s] Starting XPath parsing...", engine_name);
+ int found_count = 0;
+ xmlXPathContextPtr xpathCtx = xmlXPathNewContext(doc);
+ if (!xpathCtx) {
+ LOG_ERROR("[%s] Could not create XPath context", engine_name);
+ return 0;
+ }
+
+ const char *container_xpath = "//div[contains(@class, 'result')]";
+ xmlXPathObjectPtr xpathObj =
+ xmlXPathEvalExpression((xmlChar *)container_xpath, xpathCtx);
+
+ if (!xpathObj || !xpathObj->nodesetval || xpathObj->nodesetval->nodeNr == 0) {
+ LOG_WARN("[%s] No result containers found with XPath: %s", engine_name,
+ container_xpath);
+ if (xpathObj) xmlXPathFreeObject(xpathObj);
+ xmlXPathFreeContext(xpathCtx);
+ return 0;
+ }
+
+ int num_results = xpathObj->nodesetval->nodeNr;
+ LOG_INFO("[%s] Found %d result containers", engine_name, num_results);
+
+ int actual_alloc = (num_results < max_results) ? num_results : max_results;
+ *out_results = (SearchResult *)calloc(actual_alloc, sizeof(SearchResult));
+ if (!*out_results) {
+ LOG_ERROR("[%s] Failed to allocate memory for results", engine_name);
+ xmlXPathFreeObject(xpathObj);
+ xmlXPathFreeContext(xpathCtx);
+ return 0;
+ }
+
+ for (int i = 0; i < num_results && found_count < max_results; i++) {
+ xmlNodePtr resultNode = xpathObj->nodesetval->nodeTab[i];
+ xmlXPathContextPtr resCtx = xmlXPathNewContext(doc);
+ if (!resCtx) {
+ LOG_ERROR("[%s] Failed to create result context for item %d", engine_name,
+ i);
+ continue;
+ }
+ resCtx->node = resultNode;
+
+ xmlXPathObjectPtr linkObj = xmlXPathEvalExpression(
+ (xmlChar *)".//a[contains(@class, 'result-link')]", resCtx);
+ char *url =
+ (linkObj && linkObj->nodesetval && linkObj->nodesetval->nodeNr > 0)
+ ? (char *)xmlGetProp(linkObj->nodesetval->nodeTab[0],
+ (xmlChar *)"href")
+ : NULL;
+
+ xmlXPathObjectPtr titleObj = xmlXPathEvalExpression(
+ (xmlChar *)".//h2[contains(@class, 'wgl-title')]", resCtx);
+ char *title =
+ (titleObj && titleObj->nodesetval && titleObj->nodesetval->nodeNr > 0)
+ ? (char *)xmlNodeGetContent(titleObj->nodesetval->nodeTab[0])
+ : NULL;
+
+ xmlXPathObjectPtr snippetObj = xmlXPathEvalExpression(
+ (xmlChar *)".//p[contains(@class, 'description')]", resCtx);
+ char *snippet_text =
+ (snippetObj && snippetObj->nodesetval &&
+ snippetObj->nodesetval->nodeNr > 0)
+ ? (char *)xmlNodeGetContent(snippetObj->nodesetval->nodeTab[0])
+ : NULL;
+
+ if (url && title) {
+ (*out_results)[found_count].url = strdup(url);
+ (*out_results)[found_count].title = strdup(title);
+ (*out_results)[found_count].snippet =
+ strdup(snippet_text ? snippet_text : "");
+ LOG_DEBUG("[%s] Parsed Result %d: %s", engine_name, found_count + 1,
+ title);
+ found_count++;
+ } else {
+ LOG_WARN("[%s] Container %d missed URL or Title. URL: %s, Title: %s",
+ engine_name, i, url ? "Yes" : "No", title ? "Yes" : "No");
+ }
+
+ if (title) xmlFree(title);
+ if (url) xmlFree(url);
+ if (snippet_text) xmlFree(snippet_text);
+ if (linkObj) xmlXPathFreeObject(linkObj);
+ if (titleObj) xmlXPathFreeObject(titleObj);
+ if (snippetObj) xmlXPathFreeObject(snippetObj);
+ xmlXPathFreeContext(resCtx);
+ }
+
+ xmlXPathFreeObject(xpathObj);
+ xmlXPathFreeContext(xpathCtx);
+ return found_count;
+}
+
+static int parse_yahoo(const char *engine_name, xmlDocPtr doc,
+ SearchResult **out_results, int max_results) {
+ LOG_DEBUG("[%s] Starting XPath parsing...", engine_name);
+ int found_count = 0;
+ xmlXPathContextPtr xpathCtx = xmlXPathNewContext(doc);
+ if (!xpathCtx) {
+ LOG_ERROR("[%s] Could not create XPath context", engine_name);
+ return 0;
+ }
+
+ const char *container_xpath = "//div[contains(@class, 'algo-sr')]";
+ xmlXPathObjectPtr xpathObj =
+ xmlXPathEvalExpression((xmlChar *)container_xpath, xpathCtx);
+
+ if (!xpathObj || !xpathObj->nodesetval || xpathObj->nodesetval->nodeNr == 0) {
+ LOG_WARN("[%s] No result containers found with XPath: %s", engine_name,
+ container_xpath);
+ if (xpathObj) xmlXPathFreeObject(xpathObj);
+ xmlXPathFreeContext(xpathCtx);
+ return 0;
+ }
+
+ int num_results = xpathObj->nodesetval->nodeNr;
+ LOG_INFO("[%s] Found %d result containers", engine_name, num_results);
+
+ int actual_alloc = (num_results < max_results) ? num_results : max_results;
+ *out_results = (SearchResult *)calloc(actual_alloc, sizeof(SearchResult));
+ if (!*out_results) {
+ LOG_ERROR("[%s] Failed to allocate memory for results", engine_name);
+ xmlXPathFreeObject(xpathObj);
+ xmlXPathFreeContext(xpathCtx);
+ return 0;
+ }
+
+ for (int i = 0; i < num_results && found_count < max_results; i++) {
+ xmlNodePtr resultNode = xpathObj->nodesetval->nodeTab[i];
+ xmlXPathContextPtr resCtx = xmlXPathNewContext(doc);
+ if (!resCtx) {
+ LOG_ERROR("[%s] Failed to create result context for item %d", engine_name,
+ i);
+ continue;
+ }
+ resCtx->node = resultNode;
+
+ xmlXPathObjectPtr linkObj = xmlXPathEvalExpression(
+ (xmlChar *)".//div[contains(@class, 'compTitle')]//a[@target='_blank']",
+ resCtx);
+ char *url =
+ (linkObj && linkObj->nodesetval && linkObj->nodesetval->nodeNr > 0)
+ ? (char *)xmlGetProp(linkObj->nodesetval->nodeTab[0],
+ (xmlChar *)"href")
+ : NULL;
+
+ xmlXPathObjectPtr titleObj = xmlXPathEvalExpression(
+ (xmlChar *)".//h3[contains(@class, 'title')]", resCtx);
+ char *title =
+ (titleObj && titleObj->nodesetval && titleObj->nodesetval->nodeNr > 0)
+ ? (char *)xmlNodeGetContent(titleObj->nodesetval->nodeTab[0])
+ : NULL;
+
+ xmlXPathObjectPtr snippetObj = xmlXPathEvalExpression(
+ (xmlChar *)".//div[contains(@class, 'compText')]//p", resCtx);
+ char *snippet_text =
+ (snippetObj && snippetObj->nodesetval &&
+ snippetObj->nodesetval->nodeNr > 0)
+ ? (char *)xmlNodeGetContent(snippetObj->nodesetval->nodeTab[0])
+ : NULL;
+
+ if (!url || !title) {
+ LOG_DEBUG("[%s] Container %d debug - URL: %s, Title: %s", engine_name, i,
+ url ? url : "(null)", title ? title : "(null)");
+ }
+
+ if (url && title) {
+ (*out_results)[found_count].url = strdup(url);
+ (*out_results)[found_count].title = strdup(title);
+ (*out_results)[found_count].snippet =
+ strdup(snippet_text ? snippet_text : "");
+ LOG_DEBUG("[%s] Parsed Result %d: %s", engine_name, found_count + 1,
+ title);
+ found_count++;
+ } else {
+ LOG_WARN("[%s] Container %d missed URL or Title. URL: %s, Title: %s",
+ engine_name, i, url ? "Yes" : "No", title ? "Yes" : "No");
+ }
+
+ if (title) xmlFree(title);
+ if (url) xmlFree(url);
+ if (snippet_text) xmlFree(snippet_text);
+ if (linkObj) xmlXPathFreeObject(linkObj);
+ if (titleObj) xmlXPathFreeObject(titleObj);
+ if (snippetObj) xmlXPathFreeObject(snippetObj);
+ xmlXPathFreeContext(resCtx);
+ }
+
+ xmlXPathFreeObject(xpathObj);
+ xmlXPathFreeContext(xpathCtx);
+ return found_count;
+}
+
/* Registry of supported search engines. Each entry bundles the request
 * metadata (search URL prefix, Host and Referer header values) with the
 * parser that understands that engine's HTML layout. scrape_engine()
 * appends the URL-encoded query directly to base_url. */
const SearchEngine ENGINE_REGISTRY[] = {
    {.name = "DuckDuckGo Lite",
     .base_url = "https://lite.duckduckgo.com/lite/?q=",
     .host_header = "lite.duckduckgo.com",
     .referer = "https://lite.duckduckgo.com/",
     .parser = parse_ddg_lite},
    {.name = "Startpage",
     .base_url = "https://www.startpage.com/sp/search?q=",
     .host_header = "www.startpage.com",
     .referer = "https://www.startpage.com/",
     .parser = parse_startpage},
    {.name = "Yahoo",
     .base_url = "https://search.yahoo.com/search?p=",
     .host_header = "search.yahoo.com",
     .referer = "https://search.yahoo.com/",
     .parser = parse_yahoo}};

/* Number of entries in ENGINE_REGISTRY (exposed for iteration by callers). */
const int ENGINE_COUNT = sizeof(ENGINE_REGISTRY) / sizeof(SearchEngine);
+
/* Fetches a results page from the given engine for `query` and parses it
 * into *out_results (at most max_results entries).
 * Returns the number of results (>= 1 — a single "search manually" fallback
 * entry is synthesized when scraping fails or yields nothing), 0 when curl
 * init fails or query is NULL, or -1 on allocation/encoding failure.
 * Caller owns *out_results: free each url/title/snippet, then the array. */
int scrape_engine(const SearchEngine *engine, const char *query,
                  SearchResult **out_results, int max_results) {
  CURL *curl;
  /* 1-byte seed allocation so WriteMemoryCallback can realloc freely. */
  MemoryBuffer chunk = {.memory = (char *)malloc(1), .size = 0};
  int results_count = 0;

  LOG_INFO("--- Starting scrape for engine: %s ---", engine->name);
  LOG_INFO("[%s] Query: '%s'", engine->name, query);

  if (!chunk.memory) {
    LOG_ERROR("Initial memory allocation failed");
    return -1;
  }

  curl = curl_easy_init();

  if (curl && query) {
    char full_url[1024];
    /* Percent-encode the query; length 0 means curl computes strlen. */
    char *encoded_query = curl_easy_escape(curl, query, 0);
    if (!encoded_query) {
      LOG_ERROR("[%s] Failed to encode query", engine->name);
      curl_easy_cleanup(curl);
      free(chunk.memory);
      return -1;
    }
    snprintf(full_url, sizeof(full_url), "%s%s", engine->base_url,
             encoded_query);
    curl_free(encoded_query);

    LOG_DEBUG("[%s] Requesting URL: %s", engine->name, full_url);

    /* Browser-like headers to reduce the chance of anti-bot blocking. */
    struct curl_slist *headers = NULL;
    char host_buf[256], ref_buf[256];
    snprintf(host_buf, sizeof(host_buf), "Host: %s", engine->host_header);
    snprintf(ref_buf, sizeof(ref_buf), "Referer: %s", engine->referer);

    headers = curl_slist_append(headers, host_buf);
    headers = curl_slist_append(headers, ref_buf);
    headers = curl_slist_append(headers,
                                "Accept: "
                                "text/html,application/xhtml+xml,application/"
                                "xml;q=0.9,image/avif,image/webp,*/*;q=0.8");
    headers = curl_slist_append(headers, "Accept-Language: en-US,en;q=0.5");
    headers = curl_slist_append(headers, "DNT: 1");
    headers = curl_slist_append(headers, "Upgrade-Insecure-Requests: 1");
    headers = curl_slist_append(headers, "Sec-Fetch-Dest: document");
    headers = curl_slist_append(headers, "Sec-Fetch-Mode: navigate");
    headers = curl_slist_append(headers, "Sec-Fetch-Site: same-origin");
    headers = curl_slist_append(headers, "Connection: keep-alive");

    curl_easy_setopt(curl, CURLOPT_URL, full_url);
    curl_easy_setopt(curl, CURLOPT_HTTPHEADER, headers);
    curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, WriteMemoryCallback);
    curl_easy_setopt(curl, CURLOPT_WRITEDATA, (void *)&chunk);

    const char *ua = get_random_user_agent();
    LOG_DEBUG("[%s] Using User-Agent: %s", engine->name, ua);
    curl_easy_setopt(curl, CURLOPT_USERAGENT, ua);

    curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1L);
    curl_easy_setopt(curl, CURLOPT_TIMEOUT, 15L);
    curl_easy_setopt(curl, CURLOPT_SSL_VERIFYPEER, 1L);

    /* Empty COOKIEFILE enables the in-memory cookie engine without
     * reading any file. */
    curl_easy_setopt(curl, CURLOPT_COOKIEFILE, "");

    /* 0.5-1.5 s random delay to avoid hammering the engine.
     * NOTE(review): uses rand(); assumes srand() is seeded elsewhere —
     * TODO confirm. */
    LOG_DEBUG("[%s] Waiting for rate-limit jitter...", engine->name);
    usleep(500000 + (rand() % 1000000));

    CURLcode res = curl_easy_perform(curl);

    if (res != CURLE_OK) {
      LOG_ERROR("[%s] libcurl error: %s", engine->name,
                curl_easy_strerror(res));
    } else {
      long response_code;
      curl_easy_getinfo(curl, CURLINFO_RESPONSE_CODE, &response_code);
      LOG_INFO("[%s] HTTP Response Code: %ld", engine->name, response_code);

      if (chunk.size > 0) {
        /* Lenient HTML parse: recover from malformed markup, suppress
         * libxml error/warning spam. */
        xmlDocPtr doc = htmlReadMemory(
            chunk.memory, chunk.size, NULL, NULL,
            HTML_PARSE_RECOVER | HTML_PARSE_NOERROR | HTML_PARSE_NOWARNING);
        if (doc) {
          results_count =
              engine->parser(engine->name, doc, out_results, max_results);
          xmlFreeDoc(doc);
        }
      }
    }

    /* Fallback: always hand the caller at least one entry pointing at the
     * manual search URL when scraping produced nothing.
     * NOTE(review): if a parser allocated *out_results yet reported 0
     * results, this overwrite would leak that array — verify parsers
     * release their allocation on a zero-result return. */
    if (results_count <= 0) {
      LOG_WARN("[%s] No results found. Generating skeleton fallback.",
               engine->name);
      *out_results = (SearchResult *)malloc(sizeof(SearchResult));
      if (*out_results) {
        char fallback_msg[512];
        snprintf(fallback_msg, sizeof(fallback_msg),
                 "Search %s manually for '%s'", engine->name, query);

        (*out_results)[0].title = strdup(fallback_msg);
        (*out_results)[0].url = strdup(full_url);
        (*out_results)[0].snippet = strdup(
            "Automated results were blocked by a Captcha or anti-bot "
            "challenge. Click the link above to perform the search "
            "manually in your browser.");
        results_count = 1;
      }
    }

    curl_slist_free_all(headers);
    curl_easy_cleanup(curl);
  } else {
    /* curl_easy_init failed or query was NULL: nothing was requested. */
    if (curl) {
      curl_easy_cleanup(curl);
    }
  }

  free(chunk.memory);

  return results_count;
}
diff --git a/src/Scraping/Scraping.h b/src/Scraping/Scraping.h
new file mode 100644
index 0000000..7ad4d59
--- /dev/null
+++ b/src/Scraping/Scraping.h
@@ -0,0 +1,34 @@
+#ifndef SCRAPING_H
+#define SCRAPING_H
+
+#include <libxml/HTMLparser.h>
+
+#define LOG_INFO(msg, ...) fprintf(stderr, "[INFO] " msg "\n", ##__VA_ARGS__)
+#define LOG_WARN(msg, ...) fprintf(stderr, "[WARN] " msg "\n", ##__VA_ARGS__)
+#define LOG_DEBUG(msg, ...) fprintf(stderr, "[DEBUG] " msg "\n", ##__VA_ARGS__)
+#define LOG_ERROR(msg, ...) fprintf(stderr, "[ERROR] " msg "\n", ##__VA_ARGS__)
+
+typedef struct {
+ char *url;
+ char *title;
+ char *snippet;
+} SearchResult;
+
+typedef int (*ParserFunc)(const char *engine_name, xmlDocPtr doc,
+ SearchResult **out_results, int max_results);
+
+typedef struct {
+ const char *name;
+ const char *base_url;
+ const char *host_header;
+ const char *referer;
+ ParserFunc parser;
+} SearchEngine;
+
+extern const SearchEngine ENGINE_REGISTRY[];
+extern const int ENGINE_COUNT;
+
+int scrape_engine(const SearchEngine *engine, const char *query,
+ SearchResult **out_results, int max_results);
+
+#endif