From e33310f26351e25fda718a353f8b8e9ece0007b5 Mon Sep 17 00:00:00 2001
From: frosty <gabriel@bwaaa.monster>
Date: Tue, 10 Mar 2026 03:40:34 -0400
Subject: feature: added caching

---
 src/Scraping/Scraping.c | 73 +++++++++++++++++++++++++++++++++++++++----------
 1 file changed, 59 insertions(+), 14 deletions(-)

(limited to 'src/Scraping/Scraping.c')

diff --git a/src/Scraping/Scraping.c b/src/Scraping/Scraping.c
index a9ad913..7ba2d97 100644
--- a/src/Scraping/Scraping.c
+++ b/src/Scraping/Scraping.c
@@ -1,4 +1,5 @@
 #include "Scraping.h"
+#include "../Cache/Cache.h"
 #include "../Proxy/Proxy.h"
 #include "../Utility/Unescape.h"
 #include <curl/curl.h>
@@ -368,6 +369,10 @@ retry:
   for (int i = 0; i < num_jobs; i++) {
     ScrapeJob *job = &jobs[i];
 
+    char cache_key[64];
+    char full_url[1024];
+    char *encoded_query = NULL;
+
     if (job->handle) {
       curl_easy_cleanup(job->handle);
       job->handle = NULL;
@@ -376,20 +381,8 @@ retry:
       free(job->response.memory);
     }
 
-    job->handle = curl_easy_init();
-    if (!job->handle) {
-      continue;
-    }
-
-    job->response.memory = (char *)malloc(16384);
-    job->response.size = 0;
-    job->response.capacity = 16384;
-
-    char full_url[1024];
-    char *encoded_query = curl_easy_escape(job->handle, job->query, 0);
+    encoded_query = curl_easy_escape(NULL, job->query, 0);
     if (!encoded_query) {
-      curl_easy_cleanup(job->handle);
-      job->handle = NULL;
       continue;
     }
 
@@ -399,7 +392,52 @@ retry:
 
     snprintf(full_url, sizeof(full_url), "%s%s&%s=%d", job->engine->base_url,
              encoded_query, job->engine->page_param, page_value);
-    curl_free(encoded_query);
+
+    char *key = cache_compute_key(job->query, job->page, job->engine->name);
+    if (key) {
+      strncpy(cache_key, key, sizeof(cache_key) - 1);
+      cache_key[sizeof(cache_key) - 1] = '\0';
+      free(key);
+    } else {
+      snprintf(cache_key, sizeof(cache_key), "uncached_%d_%s", i,
+               job->engine->name);
+    }
+
+    char *cached_data = NULL;
+    size_t cached_size = 0;
+    int cache_hit = 0;
+
+    if (get_cache_ttl_search() > 0 &&
+        cache_get(cache_key, (time_t)get_cache_ttl_search(), &cached_data,
+                  &cached_size) == 0 &&
+        cached_data && cached_size > 0) {
+      xmlDocPtr doc = htmlReadMemory(cached_data, cached_size, NULL, NULL,
+                                     HTML_PARSE_RECOVER | HTML_PARSE_NOERROR |
+                                         HTML_PARSE_NOWARNING);
+      if (doc) {
+        job->results_count = job->engine->parser(
+            job->engine->name, doc, job->out_results, job->max_results);
+        xmlFreeDoc(doc);
+        cache_hit = 1;
+      }
+      free(cached_data);
+    }
+
+    if (cache_hit) {
+      free(encoded_query);
+      job->results_count = job->results_count > 0 ? job->results_count : 0;
+      continue;
+    }
+
+    job->handle = curl_easy_init();
+    if (!job->handle) {
+      free(encoded_query);
+      continue;
+    }
+
+    job->response.memory = (char *)malloc(16384);
+    job->response.size = 0;
+    job->response.capacity = 16384;
 
     struct curl_slist *headers = NULL;
     char host_buf[256], ref_buf[256];
@@ -451,6 +489,13 @@ retry:
           curl_easy_getinfo(handle, CURLINFO_RESPONSE_CODE, &response_code);
 
           if (msg->data.result == CURLE_OK && job->response.size > 0) {
+            char *key =
+                cache_compute_key(job->query, job->page, job->engine->name);
+            if (key && get_cache_ttl_search() > 0) {
+              cache_set(key, job->response.memory, job->response.size);
+              free(key);
+            }
+
             xmlDocPtr doc = htmlReadMemory(
                 job->response.memory, job->response.size, NULL, NULL,
                 HTML_PARSE_RECOVER | HTML_PARSE_NOERROR | HTML_PARSE_NOWARNING);
-- 
cgit v1.2.3