From e33310f26351e25fda718a353f8b8e9ece0007b5 Mon Sep 17 00:00:00 2001 From: frosty Date: Tue, 10 Mar 2026 03:40:34 -0400 Subject: feature: added caching --- src/Scraping/Scraping.c | 73 +++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 59 insertions(+), 14 deletions(-) (limited to 'src/Scraping/Scraping.c') diff --git a/src/Scraping/Scraping.c b/src/Scraping/Scraping.c index a9ad913..7ba2d97 100644 --- a/src/Scraping/Scraping.c +++ b/src/Scraping/Scraping.c @@ -1,4 +1,5 @@ #include "Scraping.h" +#include "../Cache/Cache.h" #include "../Proxy/Proxy.h" #include "../Utility/Unescape.h" #include @@ -368,6 +369,10 @@ retry: for (int i = 0; i < num_jobs; i++) { ScrapeJob *job = &jobs[i]; + char cache_key[64]; + char full_url[1024]; + char *encoded_query = NULL; + if (job->handle) { curl_easy_cleanup(job->handle); job->handle = NULL; @@ -376,20 +381,8 @@ retry: free(job->response.memory); } - job->handle = curl_easy_init(); - if (!job->handle) { - continue; - } - - job->response.memory = (char *)malloc(16384); - job->response.size = 0; - job->response.capacity = 16384; - - char full_url[1024]; - char *encoded_query = curl_easy_escape(job->handle, job->query, 0); + encoded_query = curl_easy_escape(NULL, job->query, 0); if (!encoded_query) { - curl_easy_cleanup(job->handle); - job->handle = NULL; continue; } @@ -399,7 +392,52 @@ retry: snprintf(full_url, sizeof(full_url), "%s%s&%s=%d", job->engine->base_url, encoded_query, job->engine->page_param, page_value); - curl_free(encoded_query); + + char *key = cache_compute_key(job->query, job->page, job->engine->name); + if (key) { + strncpy(cache_key, key, sizeof(cache_key) - 1); + cache_key[sizeof(cache_key) - 1] = '\0'; + free(key); + } else { + snprintf(cache_key, sizeof(cache_key), "uncached_%d_%s", i, + job->engine->name); + } + + char *cached_data = NULL; + size_t cached_size = 0; + int cache_hit = 0; + + if (get_cache_ttl_search() > 0 && + cache_get(cache_key, (time_t)get_cache_ttl_search(), &cached_data, + &cached_size) == 0 && + cached_data && cached_size > 0) { + xmlDocPtr doc = htmlReadMemory(cached_data, cached_size, NULL, NULL, + HTML_PARSE_RECOVER | HTML_PARSE_NOERROR | + HTML_PARSE_NOWARNING); + if (doc) { + job->results_count = job->engine->parser( + job->engine->name, doc, job->out_results, job->max_results); + xmlFreeDoc(doc); + cache_hit = 1; + } + free(cached_data); + } + + if (cache_hit) { + free(encoded_query); + job->results_count = job->results_count > 0 ? job->results_count : 0; + continue; + } + + job->handle = curl_easy_init(); + if (!job->handle) { + free(encoded_query); + continue; + } + + job->response.memory = (char *)malloc(16384); + job->response.size = 0; + job->response.capacity = 16384; struct curl_slist *headers = NULL; char host_buf[256], ref_buf[256]; @@ -451,6 +489,13 @@ retry: curl_easy_getinfo(handle, CURLINFO_RESPONSE_CODE, &response_code); if (msg->data.result == CURLE_OK && job->response.size > 0) { + char *key = + cache_compute_key(job->query, job->page, job->engine->name); + if (key && get_cache_ttl_search() > 0) { + cache_set(key, job->response.memory, job->response.size); + free(key); + } + xmlDocPtr doc = htmlReadMemory( job->response.memory, job->response.size, NULL, NULL, HTML_PARSE_RECOVER | HTML_PARSE_NOERROR | HTML_PARSE_NOWARNING); -- cgit v1.2.3