aboutsummaryrefslogtreecommitdiff
path: root/src/Scraping/Scraping.c
diff options
context:
space:
mode:
author    frosty <gabriel@bwaaa.monster>  2026-03-10 03:40:34 -0400
committer frosty <gabriel@bwaaa.monster>  2026-03-10 03:40:34 -0400
commite33310f26351e25fda718a353f8b8e9ece0007b5 (patch)
treeac371b6c35bbdda348afbfcfd05ac5ee2c372ab7 /src/Scraping/Scraping.c
parenta11bf8bb6c1baaef51c25d441f5348567280967b (diff)
downloadomnisearch-e33310f26351e25fda718a353f8b8e9ece0007b5.tar.gz
feature: added caching
Diffstat (limited to 'src/Scraping/Scraping.c')
-rw-r--r--  src/Scraping/Scraping.c  | 73
1 file changed, 59 insertions(+), 14 deletions(-)
diff --git a/src/Scraping/Scraping.c b/src/Scraping/Scraping.c
index a9ad913..7ba2d97 100644
--- a/src/Scraping/Scraping.c
+++ b/src/Scraping/Scraping.c
@@ -1,4 +1,5 @@
#include "Scraping.h"
+#include "../Cache/Cache.h"
#include "../Proxy/Proxy.h"
#include "../Utility/Unescape.h"
#include <curl/curl.h>
@@ -368,6 +369,10 @@ retry:
for (int i = 0; i < num_jobs; i++) {
ScrapeJob *job = &jobs[i];
+ char cache_key[64];
+ char full_url[1024];
+ char *encoded_query = NULL;
+
if (job->handle) {
curl_easy_cleanup(job->handle);
job->handle = NULL;
@@ -376,20 +381,8 @@ retry:
free(job->response.memory);
}
- job->handle = curl_easy_init();
- if (!job->handle) {
- continue;
- }
-
- job->response.memory = (char *)malloc(16384);
- job->response.size = 0;
- job->response.capacity = 16384;
-
- char full_url[1024];
- char *encoded_query = curl_easy_escape(job->handle, job->query, 0);
+ encoded_query = curl_easy_escape(NULL, job->query, 0);
if (!encoded_query) {
- curl_easy_cleanup(job->handle);
- job->handle = NULL;
continue;
}
@@ -399,7 +392,52 @@ retry:
snprintf(full_url, sizeof(full_url), "%s%s&%s=%d", job->engine->base_url,
encoded_query, job->engine->page_param, page_value);
- curl_free(encoded_query);
+
+ char *key = cache_compute_key(job->query, job->page, job->engine->name);
+ if (key) {
+ strncpy(cache_key, key, sizeof(cache_key) - 1);
+ cache_key[sizeof(cache_key) - 1] = '\0';
+ free(key);
+ } else {
+ snprintf(cache_key, sizeof(cache_key), "uncached_%d_%s", i,
+ job->engine->name);
+ }
+
+ char *cached_data = NULL;
+ size_t cached_size = 0;
+ int cache_hit = 0;
+
+ if (get_cache_ttl_search() > 0 &&
+ cache_get(cache_key, (time_t)get_cache_ttl_search(), &cached_data,
+ &cached_size) == 0 &&
+ cached_data && cached_size > 0) {
+ xmlDocPtr doc = htmlReadMemory(cached_data, cached_size, NULL, NULL,
+ HTML_PARSE_RECOVER | HTML_PARSE_NOERROR |
+ HTML_PARSE_NOWARNING);
+ if (doc) {
+ job->results_count = job->engine->parser(
+ job->engine->name, doc, job->out_results, job->max_results);
+ xmlFreeDoc(doc);
+ cache_hit = 1;
+ }
+ free(cached_data);
+ }
+
+ if (cache_hit) {
+ free(encoded_query);
+ job->results_count = job->results_count > 0 ? job->results_count : 0;
+ continue;
+ }
+
+ job->handle = curl_easy_init();
+ if (!job->handle) {
+ free(encoded_query);
+ continue;
+ }
+
+ job->response.memory = (char *)malloc(16384);
+ job->response.size = 0;
+ job->response.capacity = 16384;
struct curl_slist *headers = NULL;
char host_buf[256], ref_buf[256];
@@ -451,6 +489,13 @@ retry:
curl_easy_getinfo(handle, CURLINFO_RESPONSE_CODE, &response_code);
if (msg->data.result == CURLE_OK && job->response.size > 0) {
+ char *key =
+ cache_compute_key(job->query, job->page, job->engine->name);
+ if (key && get_cache_ttl_search() > 0) {
+ cache_set(key, job->response.memory, job->response.size);
+ free(key);
+ }
+
xmlDocPtr doc = htmlReadMemory(
job->response.memory, job->response.size, NULL, NULL,
HTML_PARSE_RECOVER | HTML_PARSE_NOERROR | HTML_PARSE_NOWARNING);