From 47f16db1909d185f7a6c5987226f64f0e2788262 Mon Sep 17 00:00:00 2001
From: frosty
Date: Thu, 22 Jan 2026 12:57:27 -0500
Subject: scraping: parallelize fetches and cut redundant allocations

---
 src/Scraping/Scraping.c | 297 ++++++++++++++++++++++++------------------------
 src/Scraping/Scraping.h |  21 +++-
 2 files changed, 171 insertions(+), 147 deletions(-)

(limited to 'src/Scraping')

diff --git a/src/Scraping/Scraping.c b/src/Scraping/Scraping.c
index d2afea6..c236b0f 100644
--- a/src/Scraping/Scraping.c
+++ b/src/Scraping/Scraping.c
@@ -8,23 +8,25 @@
 #include
 #include
 
-typedef struct {
-  char *memory;
-  size_t size;
-} MemoryBuffer;
-
 static size_t WriteMemoryCallback(void *contents, size_t size, size_t nmemb,
                                   void *userp) {
   size_t realsize = size * nmemb;
   MemoryBuffer *mem = (MemoryBuffer *)userp;
 
-  char *ptr = (char *)realloc(mem->memory, mem->size + realsize + 1);
-  if (ptr == NULL) {
-    LOG_ERROR("Not enough memory (realloc returned NULL)");
-    return 0;
+  if (mem->size + realsize + 1 > mem->capacity) {
+
+    size_t new_cap = mem->capacity == 0 ? 16384 : mem->capacity * 2;
+    while (new_cap < mem->size + realsize + 1) new_cap *= 2;
+
+    char *ptr = (char *)realloc(mem->memory, new_cap);
+    if (!ptr) {
+      LOG_ERROR("Not enough memory (realloc returned NULL)");
+      return 0;
+    }
+    mem->memory = ptr;
+    mem->capacity = new_cap;
   }
-  mem->memory = ptr;
 
   memcpy(&(mem->memory[mem->size]), contents, realsize);
   mem->size += realsize;
   mem->memory[mem->size] = 0;
@@ -70,8 +72,7 @@ static int parse_ddg_lite(const char *engine_name, xmlDocPtr doc,
   }
 
   int num_links = xpathObj->nodesetval->nodeNr;
-  LOG_INFO("[%s] XPath matched %d potential result links", engine_name,
-           num_links);
+  LOG_INFO("[%s] XPath matched %d potential result links", engine_name, num_links);
 
   int actual_alloc = (num_links < max_results) ? num_links : max_results;
   *out_results = (SearchResult *)calloc(actual_alloc, sizeof(SearchResult));
@@ -98,25 +99,22 @@ static int parse_ddg_lite(const char *engine_name, xmlDocPtr doc,
            xmlStrcasecmp(snippetRow->name, (const xmlChar *)"tr") != 0)
       snippetRow = snippetRow->next;
     if (snippetRow) {
-      xmlXPathContextPtr subCtx = xmlXPathNewContext(doc);
-      if (subCtx) {
-        subCtx->node = snippetRow;
-        xmlXPathObjectPtr sObj = xmlXPathEvalExpression(
-            (xmlChar *)".//td[@class='result-snippet']", subCtx);
-        if (sObj && sObj->nodesetval && sObj->nodesetval->nodeNr > 0) {
-          snippet_text =
-              (char *)xmlNodeGetContent(sObj->nodesetval->nodeTab[0]);
-        }
-        if (sObj) xmlXPathFreeObject(sObj);
-        xmlXPathFreeContext(subCtx);
+
+      xpathCtx->node = snippetRow;
+      xmlXPathObjectPtr sObj = xmlXPathEvalExpression(
+          (xmlChar *)".//td[@class='result-snippet']", xpathCtx);
+      if (sObj && sObj->nodesetval && sObj->nodesetval->nodeNr > 0) {
+        snippet_text = (char *)xmlNodeGetContent(sObj->nodesetval->nodeTab[0]);
       }
+      if (sObj) xmlXPathFreeObject(sObj);
+      xpathCtx->node = NULL;
+
     }
   }
 
    (*out_results)[found_count].url = strdup(url ? url : "");
    (*out_results)[found_count].title = strdup(title ? title : "No Title");
-    (*out_results)[found_count].snippet =
-        strdup(snippet_text ? snippet_text : "");
+    (*out_results)[found_count].snippet = strdup(snippet_text ? snippet_text : "");
    LOG_DEBUG("[%s] Parsed Result %d: %s", engine_name, found_count + 1,
              (*out_results)[found_count].title);
@@ -168,16 +166,10 @@ static int parse_startpage(const char *engine_name, xmlDocPtr doc,
   for (int i = 0; i < num_results && found_count < max_results; i++) {
     xmlNodePtr resultNode = xpathObj->nodesetval->nodeTab[i];
 
-    xmlXPathContextPtr resCtx = xmlXPathNewContext(doc);
-    if (!resCtx) {
-      LOG_ERROR("[%s] Failed to create result context for item %d", engine_name,
-                i);
-      continue;
-    }
-    resCtx->node = resultNode;
+    xpathCtx->node = resultNode;
 
     xmlXPathObjectPtr linkObj = xmlXPathEvalExpression(
-        (xmlChar *)".//a[contains(@class, 'result-link')]", resCtx);
+        (xmlChar *)".//a[contains(@class, 'result-link')]", xpathCtx);
     char *url =
         (linkObj && linkObj->nodesetval && linkObj->nodesetval->nodeNr > 0)
            ? (char *)xmlGetProp(linkObj->nodesetval->nodeTab[0],
@@ -185,14 +177,14 @@ static int parse_startpage(const char *engine_name, xmlDocPtr doc,
            : NULL;
 
     xmlXPathObjectPtr titleObj = xmlXPathEvalExpression(
-        (xmlChar *)".//h2[contains(@class, 'wgl-title')]", resCtx);
+        (xmlChar *)".//h2[contains(@class, 'wgl-title')]", xpathCtx);
     char *title =
         (titleObj && titleObj->nodesetval && titleObj->nodesetval->nodeNr > 0)
            ? (char *)xmlNodeGetContent(titleObj->nodesetval->nodeTab[0])
            : NULL;
 
     xmlXPathObjectPtr snippetObj = xmlXPathEvalExpression(
-        (xmlChar *)".//p[contains(@class, 'description')]", resCtx);
+        (xmlChar *)".//p[contains(@class, 'description')]", xpathCtx);
     char *snippet_text =
         (snippetObj && snippetObj->nodesetval &&
          snippetObj->nodesetval->nodeNr > 0)
@@ -204,8 +196,7 @@ static int parse_startpage(const char *engine_name, xmlDocPtr doc,
       (*out_results)[found_count].url = strdup(url);
       (*out_results)[found_count].title = strdup(title);
       (*out_results)[found_count].snippet = strdup(snippet_text ? snippet_text : "");
-      LOG_DEBUG("[%s] Parsed Result %d: %s", engine_name, found_count + 1,
-                title);
+      LOG_DEBUG("[%s] Parsed Result %d: %s", engine_name, found_count + 1, title);
       found_count++;
     } else {
       LOG_WARN("[%s] Container %d missed URL or Title. URL: %s, Title: %s",
@@ -218,9 +209,10 @@ static int parse_startpage(const char *engine_name, xmlDocPtr doc,
     if (linkObj) xmlXPathFreeObject(linkObj);
     if (titleObj) xmlXPathFreeObject(titleObj);
     if (snippetObj) xmlXPathFreeObject(snippetObj);
-    xmlXPathFreeContext(resCtx);
   }
 
+  xpathCtx->node = NULL;
+
   xmlXPathFreeObject(xpathObj);
   xmlXPathFreeContext(xpathCtx);
   return found_count;
@@ -262,17 +254,11 @@ static int parse_yahoo(const char *engine_name, xmlDocPtr doc,
   for (int i = 0; i < num_results && found_count < max_results; i++) {
     xmlNodePtr resultNode = xpathObj->nodesetval->nodeTab[i];
 
-    xmlXPathContextPtr resCtx = xmlXPathNewContext(doc);
-    if (!resCtx) {
-      LOG_ERROR("[%s] Failed to create result context for item %d", engine_name,
-                i);
-      continue;
-    }
-    resCtx->node = resultNode;
+    xpathCtx->node = resultNode;
 
     xmlXPathObjectPtr linkObj = xmlXPathEvalExpression(
         (xmlChar *)".//div[contains(@class, 'compTitle')]//a[@target='_blank']",
-        resCtx);
+        xpathCtx);
     char *url =
         (linkObj && linkObj->nodesetval && linkObj->nodesetval->nodeNr > 0)
            ? (char *)xmlGetProp(linkObj->nodesetval->nodeTab[0],
@@ -280,32 +266,26 @@ static int parse_yahoo(const char *engine_name, xmlDocPtr doc,
            : NULL;
 
     xmlXPathObjectPtr titleObj = xmlXPathEvalExpression(
-        (xmlChar *)".//h3[contains(@class, 'title')]", resCtx);
+        (xmlChar *)".//h3[contains(@class, 'title')]", xpathCtx);
     char *title =
         (titleObj && titleObj->nodesetval && titleObj->nodesetval->nodeNr > 0)
           ? (char *)xmlNodeGetContent(titleObj->nodesetval->nodeTab[0])
           : NULL;
 
     xmlXPathObjectPtr snippetObj = xmlXPathEvalExpression(
-        (xmlChar *)".//div[contains(@class, 'compText')]//p", resCtx);
+        (xmlChar *)".//div[contains(@class, 'compText')]//p", xpathCtx);
     char *snippet_text =
         (snippetObj && snippetObj->nodesetval &&
         snippetObj->nodesetval->nodeNr > 0)
           ? (char *)xmlNodeGetContent(snippetObj->nodesetval->nodeTab[0])
           : NULL;
 
-    if (!url || !title) {
-      LOG_DEBUG("[%s] Container %d debug - URL: %s, Title: %s", engine_name, i,
-                url ? url : "(null)", title ? title : "(null)");
-    }
-
     if (url && title) {
       (*out_results)[found_count].url = strdup(url);
       (*out_results)[found_count].title = strdup(title);
       (*out_results)[found_count].snippet =
           strdup(snippet_text ? snippet_text : "");
-      LOG_DEBUG("[%s] Parsed Result %d: %s", engine_name, found_count + 1,
-                title);
+      LOG_DEBUG("[%s] Parsed Result %d: %s", engine_name, found_count + 1, title);
       found_count++;
     } else {
       LOG_WARN("[%s] Container %d missed URL or Title. URL: %s, Title: %s",
@@ -318,9 +298,9 @@ static int parse_yahoo(const char *engine_name, xmlDocPtr doc,
     if (linkObj) xmlXPathFreeObject(linkObj);
     if (titleObj) xmlXPathFreeObject(titleObj);
     if (snippetObj) xmlXPathFreeObject(snippetObj);
-    xmlXPathFreeContext(resCtx);
   }
 
+  xpathCtx->node = NULL;
   xmlXPathFreeObject(xpathObj);
   xmlXPathFreeContext(xpathCtx);
   return found_count;
@@ -345,124 +325,149 @@ const SearchEngine ENGINE_REGISTRY[] = {
 const int ENGINE_COUNT = sizeof(ENGINE_REGISTRY) / sizeof(SearchEngine);
 
-int scrape_engine(const SearchEngine *engine, const char *query,
-                  SearchResult **out_results, int max_results) {
-  CURL *curl;
-  MemoryBuffer chunk = {.memory = (char *)malloc(1), .size = 0};
-  int results_count = 0;
+static void configure_curl_handle(CURL *curl, const char *full_url,
+                                  MemoryBuffer *chunk,
+                                  struct curl_slist *headers) {
+  curl_easy_setopt(curl, CURLOPT_URL, full_url);
+  curl_easy_setopt(curl, CURLOPT_HTTPHEADER, headers);
+  curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, WriteMemoryCallback);
+  curl_easy_setopt(curl, CURLOPT_WRITEDATA, (void *)chunk);
+  curl_easy_setopt(curl, CURLOPT_USERAGENT, get_random_user_agent());
+
+  curl_easy_setopt(curl, CURLOPT_HTTP_VERSION, CURL_HTTP_VERSION_2_0);
+
+  curl_easy_setopt(curl, CURLOPT_ACCEPT_ENCODING, "");
 
-  LOG_INFO("--- Starting scrape for engine: %s ---", engine->name);
-  LOG_INFO("[%s] Query: '%s'", engine->name, query);
+  curl_easy_setopt(curl, CURLOPT_DNS_CACHE_TIMEOUT, 300L);
+
+  curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1L);
+  curl_easy_setopt(curl, CURLOPT_TIMEOUT, 15L);
+  curl_easy_setopt(curl, CURLOPT_SSL_VERIFYPEER, 1L);
+  curl_easy_setopt(curl, CURLOPT_COOKIEFILE, "");
+}
 
-  if (!chunk.memory) {
-    LOG_ERROR("Initial memory allocation failed");
+int scrape_engines_parallel(ScrapeJob *jobs, int num_jobs) {
+  CURLM *multi_handle = curl_multi_init();
+  if (!multi_handle) {
+    LOG_ERROR("Failed to initialize curl_multi");
     return -1;
   }
 
-  curl = curl_easy_init();
+  for (int i = 0; i < num_jobs; i++) {
+    ScrapeJob *job = &jobs[i];
+    job->handle = curl_easy_init();
+    if (!job->handle) {
+      LOG_ERROR("[%s] Failed to init CURL handle", job->engine->name);
+      continue;
+    }
+
+    job->response.memory = (char *)malloc(16384);
+    job->response.size = 0;
+    job->response.capacity = 16384;
 
-  if (curl && query) {
     char full_url[1024];
-    char *encoded_query = curl_easy_escape(curl, query, 0);
+    char *encoded_query = curl_easy_escape(job->handle, job->query, 0);
     if (!encoded_query) {
-      LOG_ERROR("[%s] Failed to encode query", engine->name);
-      curl_easy_cleanup(curl);
-      free(chunk.memory);
-      return -1;
+      LOG_ERROR("[%s] Failed to encode query", job->engine->name);
+      curl_easy_cleanup(job->handle);
+      job->handle = NULL;
+      continue;
     }
-    snprintf(full_url, sizeof(full_url), "%s%s", engine->base_url,
-             encoded_query);
+    snprintf(full_url, sizeof(full_url), "%s%s", job->engine->base_url, encoded_query);
     curl_free(encoded_query);
 
-    LOG_DEBUG("[%s] Requesting URL: %s", engine->name, full_url);
-
     struct curl_slist *headers = NULL;
     char host_buf[256], ref_buf[256];
-    snprintf(host_buf, sizeof(host_buf), "Host: %s", engine->host_header);
-    snprintf(ref_buf, sizeof(ref_buf), "Referer: %s", engine->referer);
-
+    snprintf(host_buf, sizeof(host_buf), "Host: %s", job->engine->host_header);
+    snprintf(ref_buf, sizeof(ref_buf), "Referer: %s", job->engine->referer);
     headers = curl_slist_append(headers, host_buf);
     headers = curl_slist_append(headers, ref_buf);
-    headers = curl_slist_append(headers,
-                                "Accept: "
-                                "text/html,application/xhtml+xml,application/"
-                                "xml;q=0.9,image/avif,image/webp,*/*;q=0.8");
+    headers = curl_slist_append(headers, "Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8");
     headers = curl_slist_append(headers, "Accept-Language: en-US,en;q=0.5");
     headers = curl_slist_append(headers, "DNT: 1");
-    headers = curl_slist_append(headers, "Upgrade-Insecure-Requests: 1");
-    headers = curl_slist_append(headers, "Sec-Fetch-Dest: document");
-    headers = curl_slist_append(headers, "Sec-Fetch-Mode: navigate");
-    headers = curl_slist_append(headers, "Sec-Fetch-Site: same-origin");
-    headers = curl_slist_append(headers, "Connection: keep-alive");
 
-    curl_easy_setopt(curl, CURLOPT_URL, full_url);
-    curl_easy_setopt(curl, CURLOPT_HTTPHEADER, headers);
-    curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, WriteMemoryCallback);
-    curl_easy_setopt(curl, CURLOPT_WRITEDATA, (void *)&chunk);
+    configure_curl_handle(job->handle, full_url, &job->response, headers);
 
-    const char *ua = get_random_user_agent();
-    LOG_DEBUG("[%s] Using User-Agent: %s", engine->name, ua);
-    curl_easy_setopt(curl, CURLOPT_USERAGENT, ua);
+    curl_easy_setopt(job->handle, CURLOPT_PRIVATE, headers);
 
-    curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1L);
-    curl_easy_setopt(curl, CURLOPT_TIMEOUT, 15L);
-    curl_easy_setopt(curl, CURLOPT_SSL_VERIFYPEER, 1L);
+    curl_multi_add_handle(multi_handle, job->handle);
+    LOG_INFO("[%s] Added to parallel queue", job->engine->name);
+  }
 
-    curl_easy_setopt(curl, CURLOPT_COOKIEFILE, "");
+  usleep(100000 + (rand() % 100000));
 
-    LOG_DEBUG("[%s] Waiting for rate-limit jitter...", engine->name);
-    usleep(500000 + (rand() % 1000000));
+  int still_running = 0;
+  curl_multi_perform(multi_handle, &still_running);
 
-    CURLcode res = curl_easy_perform(curl);
+  do {
+    int numfds = 0;
+    CURLMcode mc = curl_multi_wait(multi_handle, NULL, 0, 1000, &numfds);
 
-    if (res != CURLE_OK) {
-      LOG_ERROR("[%s] libcurl error: %s", engine->name,
-                curl_easy_strerror(res));
-    } else {
-      long response_code;
-      curl_easy_getinfo(curl, CURLINFO_RESPONSE_CODE, &response_code);
-      LOG_INFO("[%s] HTTP Response Code: %ld", engine->name, response_code);
-
-      if (chunk.size > 0) {
-        xmlDocPtr doc = htmlReadMemory(
-            chunk.memory, chunk.size, NULL, NULL,
-            HTML_PARSE_RECOVER | HTML_PARSE_NOERROR | HTML_PARSE_NOWARNING);
-        if (doc) {
-          results_count =
-              engine->parser(engine->name, doc, out_results, max_results);
-          xmlFreeDoc(doc);
-        }
-      }
+    if (mc != CURLM_OK) {
+      LOG_ERROR("curl_multi_wait() failed: %s", curl_multi_strerror(mc));
+      break;
     }
 
-    if (results_count <= 0) {
-      LOG_WARN("[%s] No results found. Generating skeleton fallback.",
-               engine->name);
-      *out_results = (SearchResult *)malloc(sizeof(SearchResult));
-      if (*out_results) {
-        char fallback_msg[512];
-        snprintf(fallback_msg, sizeof(fallback_msg),
-                 "Search %s manually for '%s'", engine->name, query);
-
-        (*out_results)[0].title = strdup(fallback_msg);
-        (*out_results)[0].url = strdup(full_url);
-        (*out_results)[0].snippet = strdup(
-            "Automated results were blocked by a Captcha or anti-bot "
-            "challenge. Click the link above to perform the search "
-            "manually in your browser.");
-        results_count = 1;
-      }
-    }
+    curl_multi_perform(multi_handle, &still_running);
+  } while (still_running);
+
+  CURLMsg *msg;
+  int msgs_left;
+  while ((msg = curl_multi_info_read(multi_handle, &msgs_left))) {
+    if (msg->msg == CURLMSG_DONE) {
+      CURL *handle = msg->easy_handle;
+
+      for (int i = 0; i < num_jobs; i++) {
+        if (jobs[i].handle == handle) {
+          ScrapeJob *job = &jobs[i];
+
+          long response_code;
+          curl_easy_getinfo(handle, CURLINFO_RESPONSE_CODE, &response_code);
+          LOG_INFO("[%s] HTTP Response Code: %ld", job->engine->name, response_code);
+
+          if (msg->data.result == CURLE_OK && job->response.size > 0) {
+            xmlDocPtr doc = htmlReadMemory(
+                job->response.memory, job->response.size, NULL, NULL,
+                HTML_PARSE_RECOVER | HTML_PARSE_NOERROR | HTML_PARSE_NOWARNING);
+
+            if (doc) {
+              job->results_count = job->engine->parser(
+                  job->engine->name, doc, job->out_results, job->max_results);
+              xmlFreeDoc(doc);
+            }
+          } else {
+            LOG_ERROR("[%s] Request failed: %s", job->engine->name,
+                      curl_easy_strerror(msg->data.result));
+            job->results_count = 0;
+          }
 
-    curl_slist_free_all(headers);
-    curl_easy_cleanup(curl);
-  } else {
-    if (curl) {
-      curl_easy_cleanup(curl);
+          struct curl_slist *headers;
+          curl_easy_getinfo(handle, CURLINFO_PRIVATE, &headers);
+          if (headers) curl_slist_free_all(headers);
+
+          free(job->response.memory);
+          curl_multi_remove_handle(multi_handle, handle);
+          curl_easy_cleanup(handle);
+          break;
+        }
+      }
     }
   }
-  free(chunk.memory);
-
-  return results_count;
+  curl_multi_cleanup(multi_handle);
+  return 0;
 }
+
+int scrape_engine(const SearchEngine *engine, const char *query,
+                  SearchResult **out_results, int max_results) {
+  ScrapeJob job = {
+    .engine = engine,
+    .query = (char *)query,
+    .out_results = out_results,
+    .max_results = max_results,
+    .results_count = 0
+  };
+
+  scrape_engines_parallel(&job, 1);
+  return job.results_count;
+}
\ No newline at end of file
diff --git a/src/Scraping/Scraping.h b/src/Scraping/Scraping.h
index 7ad4d59..43e22d9 100644
--- a/src/Scraping/Scraping.h
+++ b/src/Scraping/Scraping.h
@@ -2,6 +2,7 @@
 #define SCRAPING_H
 
 #include
+#include <curl/curl.h>
 
 #define LOG_INFO(msg, ...) fprintf(stderr, "[INFO] " msg "\n", ##__VA_ARGS__)
 #define LOG_WARN(msg, ...) fprintf(stderr, "[WARN] " msg "\n", ##__VA_ARGS__)
@@ -25,10 +26,28 @@ typedef struct {
   ParserFunc parser;
 } SearchEngine;
 
+typedef struct {
+  char *memory;
+  size_t size;
+  size_t capacity;
+} MemoryBuffer;
+
+typedef struct {
+  const SearchEngine *engine;
+  char *query;
+  SearchResult **out_results;
+  int max_results;
+  CURL *handle;
+  MemoryBuffer response;
+  int results_count;
+} ScrapeJob;
+
 extern const SearchEngine ENGINE_REGISTRY[];
 extern const int ENGINE_COUNT;
 
 int scrape_engine(const SearchEngine *engine, const char *query,
                   SearchResult **out_results, int max_results);
 
-#endif
+int scrape_engines_parallel(ScrapeJob *jobs, int num_jobs);
+
+#endif
\ No newline at end of file
-- 
cgit v1.2.3
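
Usage sketch (not part of the commit): the new entry point takes an array of ScrapeJob, one per engine, and after the single curl_multi loop completes each job's out_results and results_count are filled in. The example below is a sketch under assumptions, not code from this repository: it assumes the header is reachable as "Scraping.h", that SearchResult exposes strdup()'d url/title/snippet strings which the caller frees (as the parsers above do), and uses a placeholder query string and result cap.

/* Sketch only: assumes SearchResult has char *url, *title, *snippet fields
 * that the parsers strdup() and the caller owns; query text is a placeholder. */
#include <stdio.h>
#include <stdlib.h>
#include <curl/curl.h>
#include "Scraping.h"

int main(void) {
  curl_global_init(CURL_GLOBAL_DEFAULT);

  ScrapeJob jobs[8] = {0};            /* one job (and one easy handle) per engine */
  SearchResult *results[8] = {0};
  int n = ENGINE_COUNT < 8 ? ENGINE_COUNT : 8;

  for (int i = 0; i < n; i++) {
    jobs[i].engine = &ENGINE_REGISTRY[i];
    jobs[i].query = (char *)"libcurl multi interface";  /* placeholder query */
    jobs[i].out_results = &results[i];
    jobs[i].max_results = 5;
  }

  /* All engines are fetched concurrently in one curl_multi event loop. */
  scrape_engines_parallel(jobs, n);

  for (int i = 0; i < n; i++) {
    printf("%s: %d result(s)\n", ENGINE_REGISTRY[i].name, jobs[i].results_count);
    for (int j = 0; j < jobs[i].results_count; j++) {
      printf("  %s\n    %s\n", results[i][j].title, results[i][j].url);
      free(results[i][j].url);          /* parsers strdup() these fields */
      free(results[i][j].title);
      free(results[i][j].snippet);
    }
    free(results[i]);
  }

  curl_global_cleanup();
  return 0;
}

Because the header list is stashed on each handle via CURLOPT_PRIVATE and freed inside scrape_engines_parallel along with the response buffer, the caller only owns the parsed result arrays.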