From e33310f26351e25fda718a353f8b8e9ece0007b5 Mon Sep 17 00:00:00 2001 From: frosty Date: Tue, 10 Mar 2026 03:40:34 -0400 Subject: feature: added caching --- Makefile | 2 +- example-config.ini | 12 +++++++- src/Config.c | 9 ++++++ src/Config.h | 3 ++ src/Infobox/Dictionary.c | 49 ++++++++++++++++++++++++++++++++ src/Infobox/Wikipedia.c | 35 ++++++++++++++++++++++- src/Main.c | 20 ++++++++++++- src/Scraping/Scraping.c | 73 ++++++++++++++++++++++++++++++++++++++---------- 8 files changed, 185 insertions(+), 18 deletions(-) diff --git a/Makefile b/Makefile index a598e0b..e4fc322 100644 --- a/Makefile +++ b/Makefile @@ -10,7 +10,7 @@ else LDFLAGS := endif -LIBS := -lbeaker -lcurl -lxml2 -lpthread -lm +LIBS := -lbeaker -lcurl -lxml2 -lpthread -lm -lssl -lcrypto SRC_DIR := src BIN_DIR := bin diff --git a/example-config.ini b/example-config.ini index 26385ac..dbc4f6b 100644 --- a/example-config.ini +++ b/example-config.ini @@ -1,7 +1,7 @@ [server] host = 0.0.0.0 port = 8000 - + [proxy] # Single proxy (comment out to use list_file instead) #proxy = "socks5://127.0.0.1:9050" @@ -14,3 +14,13 @@ port = 8000 # Randomize proxy credentials for each request #randomize_username = true #randomize_password = true + +[cache] +# Directory to store cached responses +#dir = /tmp/omnisearch_cache + +# Cache TTL for search results in seconds (default: 3600 = 1 hour) +#ttl_search = 3600 + +# Cache TTL for infobox data in seconds (default: 86400 = 24 hours) +#ttl_infobox = 86400 diff --git a/src/Config.c b/src/Config.c index ff57dd7..d038525 100644 --- a/src/Config.c +++ b/src/Config.c @@ -80,6 +80,15 @@ int load_config(const char *filename, Config *config) { } else if (strcmp(key, "randomize_password") == 0) { config->randomize_password = atoi(value); } + } else if (strcmp(section, "cache") == 0) { + if (strcmp(key, "dir") == 0) { + strncpy(config->cache_dir, value, sizeof(config->cache_dir) - 1); + config->cache_dir[sizeof(config->cache_dir) - 1] = '\0'; + } else if (strcmp(key, "ttl_search") == 0) { + config->cache_ttl_search = atoi(value); + } else if (strcmp(key, "ttl_infobox") == 0) { + config->cache_ttl_infobox = atoi(value); + } } } } diff --git a/src/Config.h b/src/Config.h index 17abd74..3571018 100644 --- a/src/Config.h +++ b/src/Config.h @@ -9,6 +9,9 @@ typedef struct { int max_proxy_retries; int randomize_username; int randomize_password; + char cache_dir[512]; + int cache_ttl_search; + int cache_ttl_infobox; } Config; int load_config(const char *filename, Config *config); diff --git a/src/Infobox/Dictionary.c b/src/Infobox/Dictionary.c index 053b0f2..768c2c6 100644 --- a/src/Infobox/Dictionary.c +++ b/src/Infobox/Dictionary.c @@ -1,4 +1,5 @@ #include "Dictionary.h" +#include "../Cache/Cache.h" #include "../Proxy/Proxy.h" #include "../Scraping/Scraping.h" #include @@ -266,6 +267,48 @@ InfoBox fetch_dictionary_data(const char *query) { if (!url) return info; + char *cache_key = cache_compute_key(url, 0, "dictionary"); + if (cache_key && get_cache_ttl_infobox() > 0) { + char *cached_data = NULL; + size_t cached_size = 0; + if (cache_get(cache_key, (time_t)get_cache_ttl_infobox(), &cached_data, + &cached_size) == 0 && + cached_data && cached_size > 0) { + htmlDocPtr doc = htmlReadMemory(cached_data, cached_size, url, NULL, + HTML_PARSE_RECOVER | HTML_PARSE_NOERROR | + HTML_PARSE_NOWARNING); + if (doc) { + char *word = xpath_text(doc, "//span[@class='hw dhw']"); + char *pron = xpath_text( + doc, + "//span[@class='us dpron-i']//span[@class='ipa dipa lpr-2 lpl-1']"); + char *pos = xpath_text(doc, "//span[@class='pos dpos']"); + char *def = xpath_text(doc, "(//div[@class='def ddef_d db'])[1]"); + char *ex = xpath_text(doc, "(//span[@class='eg deg'])[1]"); + + if (word && def) { + info.title = strdup("Dictionary"); + info.extract = build_html(word, pron, pos, def, ex); + info.thumbnail_url = strdup("/static/dictionary.jpg"); + info.url = strdup(url); + } + + free(word); + free(pron); + free(pos); + free(def); + free(ex); + xmlFreeDoc(doc); + } + free(cached_data); + free(cache_key); + free(url); + return info; + } + free(cached_data); + } + free(cache_key); + CURL *curl = curl_easy_init(); if (!curl) { free(url); @@ -281,6 +324,12 @@ InfoBox fetch_dictionary_data(const char *query) { apply_proxy_settings(curl); if (curl_easy_perform(curl) == CURLE_OK && chunk.size > 0) { + cache_key = cache_compute_key(url, 0, "dictionary"); + if (cache_key && get_cache_ttl_infobox() > 0) { + cache_set(cache_key, chunk.memory, chunk.size); + } + free(cache_key); + htmlDocPtr doc = htmlReadMemory(chunk.memory, chunk.size, url, NULL, HTML_PARSE_RECOVER | HTML_PARSE_NOERROR | HTML_PARSE_NOWARNING); diff --git a/src/Infobox/Wikipedia.c b/src/Infobox/Wikipedia.c index ffcc75b..d22fd89 100644 --- a/src/Infobox/Wikipedia.c +++ b/src/Infobox/Wikipedia.c @@ -1,4 +1,5 @@ #include "Wikipedia.h" +#include "../Cache/Cache.h" #include "../Proxy/Proxy.h" #include "../Scraping/Scraping.h" #include @@ -117,6 +118,32 @@ InfoBox fetch_wiki_data(char *api_url) { struct WikiMemoryStruct chunk; InfoBox info = {NULL, NULL, NULL, NULL}; + if (!api_url) { + return info; + } + + char *cache_key = cache_compute_key(api_url, 0, "wikipedia"); + if (cache_key && get_cache_ttl_infobox() > 0) { + char *cached_data = NULL; + size_t cached_size = 0; + if (cache_get(cache_key, get_cache_ttl_infobox(), &cached_data, + &cached_size) == 0 && + cached_data && cached_size > 0) { + xmlDocPtr doc = + xmlReadMemory(cached_data, cached_size, "noname.xml", NULL, 0); + if (doc != NULL) { + xmlNode *root_element = xmlDocGetRootElement(doc); + extract_wiki_info(root_element, &info); + xmlFreeDoc(doc); + } + free(cached_data); + free(cache_key); + return info; + } + free(cached_data); + } + free(cache_key); + chunk.memory = malloc(1); chunk.size = 0; @@ -132,7 +159,13 @@ InfoBox fetch_wiki_data(char *api_url) { res = curl_easy_perform(curl_handle); - if (res == CURLE_OK) { + if (res == CURLE_OK && chunk.size > 0) { + cache_key = cache_compute_key(api_url, 0, "wikipedia"); + if (cache_key && get_cache_ttl_infobox() > 0) { + cache_set(cache_key, chunk.memory, chunk.size); + } + free(cache_key); + xmlDocPtr doc = xmlReadMemory(chunk.memory, chunk.size, "noname.xml", NULL, 0); if (doc != NULL) { diff --git a/src/Main.c b/src/Main.c index eef0aef..c76764a 100644 --- a/src/Main.c +++ b/src/Main.c @@ -5,7 +5,9 @@ #include #include +#include "Cache/Cache.h" #include "Config.h" +#include "Infobox/Wikipedia.h" #include "Proxy/Proxy.h" #include "Routes/Home.h" #include "Routes/ImageProxy.h" @@ -37,12 +39,27 @@ int main() { .proxy_list_file = "", .max_proxy_retries = 3, .randomize_username = 0, - .randomize_password = 0}; + .randomize_password = 0, + .cache_dir = "/tmp/omnisearch_cache", + .cache_ttl_search = 3600, + .cache_ttl_infobox = 86400}; if (load_config("config.ini", &config) != 0) { fprintf(stderr, "Warning: Could not load config file, using defaults\n"); } + if (cache_init(config.cache_dir) != 0) { + fprintf( + stderr, + "Warning: Failed to initialize cache, continuing without caching\n"); + } else { + fprintf(stderr, "Cache initialized at %s\n", config.cache_dir); + cache_cleanup(config.cache_ttl_search); + } + + set_cache_ttl_search(config.cache_ttl_search); + set_cache_ttl_infobox(config.cache_ttl_infobox); + if (config.proxy_list_file[0] != '\0') { if (load_proxy_list(config.proxy_list_file) < 0) { fprintf( @@ -82,5 +99,6 @@ int main() { curl_global_cleanup(); xmlCleanupParser(); free_proxy_list(); + cache_shutdown(); return EXIT_SUCCESS; } diff --git a/src/Scraping/Scraping.c b/src/Scraping/Scraping.c index a9ad913..7ba2d97 100644 --- a/src/Scraping/Scraping.c +++ b/src/Scraping/Scraping.c @@ -1,4 +1,5 @@ #include "Scraping.h" +#include "../Cache/Cache.h" #include "../Proxy/Proxy.h" #include "../Utility/Unescape.h" #include @@ -368,6 +369,10 @@ retry: for (int i = 0; i < num_jobs; i++) { ScrapeJob *job = &jobs[i]; + char cache_key[64]; + char full_url[1024]; + char *encoded_query = NULL; + if (job->handle) { curl_easy_cleanup(job->handle); job->handle = NULL; @@ -376,20 +381,8 @@ retry: free(job->response.memory); } - job->handle = curl_easy_init(); - if (!job->handle) { - continue; - } - - job->response.memory = (char *)malloc(16384); - job->response.size = 0; - job->response.capacity = 16384; - - char full_url[1024]; - char *encoded_query = curl_easy_escape(job->handle, job->query, 0); + encoded_query = curl_easy_escape(NULL, job->query, 0); if (!encoded_query) { - curl_easy_cleanup(job->handle); - job->handle = NULL; continue; } @@ -399,7 +392,52 @@ retry: snprintf(full_url, sizeof(full_url), "%s%s&%s=%d", job->engine->base_url, encoded_query, job->engine->page_param, page_value); - curl_free(encoded_query); + + char *key = cache_compute_key(job->query, job->page, job->engine->name); + if (key) { + strncpy(cache_key, key, sizeof(cache_key) - 1); + cache_key[sizeof(cache_key) - 1] = '\0'; + free(key); + } else { + snprintf(cache_key, sizeof(cache_key), "uncached_%d_%s", i, + job->engine->name); + } + + char *cached_data = NULL; + size_t cached_size = 0; + int cache_hit = 0; + + if (get_cache_ttl_search() > 0 && + cache_get(cache_key, (time_t)get_cache_ttl_search(), &cached_data, + &cached_size) == 0 && + cached_data && cached_size > 0) { + xmlDocPtr doc = htmlReadMemory(cached_data, cached_size, NULL, NULL, + HTML_PARSE_RECOVER | HTML_PARSE_NOERROR | + HTML_PARSE_NOWARNING); + if (doc) { + job->results_count = job->engine->parser( + job->engine->name, doc, job->out_results, job->max_results); + xmlFreeDoc(doc); + cache_hit = 1; + } + free(cached_data); + } + + if (cache_hit) { + free(encoded_query); + job->results_count = job->results_count > 0 ? job->results_count : 0; + continue; + } + + job->handle = curl_easy_init(); + if (!job->handle) { + free(encoded_query); + continue; + } + + job->response.memory = (char *)malloc(16384); + job->response.size = 0; + job->response.capacity = 16384; struct curl_slist *headers = NULL; char host_buf[256], ref_buf[256]; @@ -451,6 +489,13 @@ retry: curl_easy_getinfo(handle, CURLINFO_RESPONSE_CODE, &response_code); if (msg->data.result == CURLE_OK && job->response.size > 0) { + char *key = + cache_compute_key(job->query, job->page, job->engine->name); + if (key && get_cache_ttl_search() > 0) { + cache_set(key, job->response.memory, job->response.size); + free(key); + } + xmlDocPtr doc = htmlReadMemory( job->response.memory, job->response.size, NULL, NULL, HTML_PARSE_RECOVER | HTML_PARSE_NOERROR | HTML_PARSE_NOWARNING); -- cgit v1.2.3