about summary refs log tree commit diff
path: root/src
diff options
context:
space:
mode:
Diffstat (limited to 'src')
-rw-r--r--  src/Config.c              |  9
-rw-r--r--  src/Config.h              |  3
-rw-r--r--  src/Infobox/Dictionary.c  | 49
-rw-r--r--  src/Infobox/Wikipedia.c   | 35
-rw-r--r--  src/Main.c                | 20
-rw-r--r--  src/Scraping/Scraping.c   | 73
6 files changed, 173 insertions(+), 16 deletions(-)
diff --git a/src/Config.c b/src/Config.c
index ff57dd7..d038525 100644
--- a/src/Config.c
+++ b/src/Config.c
@@ -80,6 +80,15 @@ int load_config(const char *filename, Config *config) {
} else if (strcmp(key, "randomize_password") == 0) {
config->randomize_password = atoi(value);
}
+ } else if (strcmp(section, "cache") == 0) {
+ if (strcmp(key, "dir") == 0) {
+ strncpy(config->cache_dir, value, sizeof(config->cache_dir) - 1);
+ config->cache_dir[sizeof(config->cache_dir) - 1] = '\0';
+ } else if (strcmp(key, "ttl_search") == 0) {
+ config->cache_ttl_search = atoi(value);
+ } else if (strcmp(key, "ttl_infobox") == 0) {
+ config->cache_ttl_infobox = atoi(value);
+ }
}
}
}
diff --git a/src/Config.h b/src/Config.h
index 17abd74..3571018 100644
--- a/src/Config.h
+++ b/src/Config.h
@@ -9,6 +9,9 @@ typedef struct {
int max_proxy_retries;
int randomize_username;
int randomize_password;
+ char cache_dir[512];
+ int cache_ttl_search;
+ int cache_ttl_infobox;
} Config;
int load_config(const char *filename, Config *config);
diff --git a/src/Infobox/Dictionary.c b/src/Infobox/Dictionary.c
index 053b0f2..768c2c6 100644
--- a/src/Infobox/Dictionary.c
+++ b/src/Infobox/Dictionary.c
@@ -1,4 +1,5 @@
#include "Dictionary.h"
+#include "../Cache/Cache.h"
#include "../Proxy/Proxy.h"
#include "../Scraping/Scraping.h"
#include <ctype.h>
@@ -266,6 +267,48 @@ InfoBox fetch_dictionary_data(const char *query) {
if (!url)
return info;
+ char *cache_key = cache_compute_key(url, 0, "dictionary");
+ if (cache_key && get_cache_ttl_infobox() > 0) {
+ char *cached_data = NULL;
+ size_t cached_size = 0;
+ if (cache_get(cache_key, (time_t)get_cache_ttl_infobox(), &cached_data,
+ &cached_size) == 0 &&
+ cached_data && cached_size > 0) {
+ htmlDocPtr doc = htmlReadMemory(cached_data, cached_size, url, NULL,
+ HTML_PARSE_RECOVER | HTML_PARSE_NOERROR |
+ HTML_PARSE_NOWARNING);
+ if (doc) {
+ char *word = xpath_text(doc, "//span[@class='hw dhw']");
+ char *pron = xpath_text(
+ doc,
+ "//span[@class='us dpron-i']//span[@class='ipa dipa lpr-2 lpl-1']");
+ char *pos = xpath_text(doc, "//span[@class='pos dpos']");
+ char *def = xpath_text(doc, "(//div[@class='def ddef_d db'])[1]");
+ char *ex = xpath_text(doc, "(//span[@class='eg deg'])[1]");
+
+ if (word && def) {
+ info.title = strdup("Dictionary");
+ info.extract = build_html(word, pron, pos, def, ex);
+ info.thumbnail_url = strdup("/static/dictionary.jpg");
+ info.url = strdup(url);
+ }
+
+ free(word);
+ free(pron);
+ free(pos);
+ free(def);
+ free(ex);
+ xmlFreeDoc(doc);
+ }
+ free(cached_data);
+ free(cache_key);
+ free(url);
+ return info;
+ }
+ free(cached_data);
+ }
+ free(cache_key);
+
CURL *curl = curl_easy_init();
if (!curl) {
free(url);
@@ -281,6 +324,12 @@ InfoBox fetch_dictionary_data(const char *query) {
apply_proxy_settings(curl);
if (curl_easy_perform(curl) == CURLE_OK && chunk.size > 0) {
+ cache_key = cache_compute_key(url, 0, "dictionary");
+ if (cache_key && get_cache_ttl_infobox() > 0) {
+ cache_set(cache_key, chunk.memory, chunk.size);
+ }
+ free(cache_key);
+
htmlDocPtr doc = htmlReadMemory(chunk.memory, chunk.size, url, NULL,
HTML_PARSE_RECOVER | HTML_PARSE_NOERROR |
HTML_PARSE_NOWARNING);
diff --git a/src/Infobox/Wikipedia.c b/src/Infobox/Wikipedia.c
index ffcc75b..d22fd89 100644
--- a/src/Infobox/Wikipedia.c
+++ b/src/Infobox/Wikipedia.c
@@ -1,4 +1,5 @@
#include "Wikipedia.h"
+#include "../Cache/Cache.h"
#include "../Proxy/Proxy.h"
#include "../Scraping/Scraping.h"
#include <curl/curl.h>
@@ -117,6 +118,32 @@ InfoBox fetch_wiki_data(char *api_url) {
struct WikiMemoryStruct chunk;
InfoBox info = {NULL, NULL, NULL, NULL};
+ if (!api_url) {
+ return info;
+ }
+
+ char *cache_key = cache_compute_key(api_url, 0, "wikipedia");
+ if (cache_key && get_cache_ttl_infobox() > 0) {
+ char *cached_data = NULL;
+ size_t cached_size = 0;
+ if (cache_get(cache_key, get_cache_ttl_infobox(), &cached_data,
+ &cached_size) == 0 &&
+ cached_data && cached_size > 0) {
+ xmlDocPtr doc =
+ xmlReadMemory(cached_data, cached_size, "noname.xml", NULL, 0);
+ if (doc != NULL) {
+ xmlNode *root_element = xmlDocGetRootElement(doc);
+ extract_wiki_info(root_element, &info);
+ xmlFreeDoc(doc);
+ }
+ free(cached_data);
+ free(cache_key);
+ return info;
+ }
+ free(cached_data);
+ }
+ free(cache_key);
+
chunk.memory = malloc(1);
chunk.size = 0;
@@ -132,7 +159,13 @@ InfoBox fetch_wiki_data(char *api_url) {
res = curl_easy_perform(curl_handle);
- if (res == CURLE_OK) {
+ if (res == CURLE_OK && chunk.size > 0) {
+ cache_key = cache_compute_key(api_url, 0, "wikipedia");
+ if (cache_key && get_cache_ttl_infobox() > 0) {
+ cache_set(cache_key, chunk.memory, chunk.size);
+ }
+ free(cache_key);
+
xmlDocPtr doc =
xmlReadMemory(chunk.memory, chunk.size, "noname.xml", NULL, 0);
if (doc != NULL) {
diff --git a/src/Main.c b/src/Main.c
index eef0aef..c76764a 100644
--- a/src/Main.c
+++ b/src/Main.c
@@ -5,7 +5,9 @@
#include <stdio.h>
#include <stdlib.h>
+#include "Cache/Cache.h"
#include "Config.h"
+#include "Infobox/Wikipedia.h"
#include "Proxy/Proxy.h"
#include "Routes/Home.h"
#include "Routes/ImageProxy.h"
@@ -37,12 +39,27 @@ int main() {
.proxy_list_file = "",
.max_proxy_retries = 3,
.randomize_username = 0,
- .randomize_password = 0};
+ .randomize_password = 0,
+ .cache_dir = "/tmp/omnisearch_cache",
+ .cache_ttl_search = 3600,
+ .cache_ttl_infobox = 86400};
if (load_config("config.ini", &config) != 0) {
fprintf(stderr, "Warning: Could not load config file, using defaults\n");
}
+ if (cache_init(config.cache_dir) != 0) {
+ fprintf(
+ stderr,
+ "Warning: Failed to initialize cache, continuing without caching\n");
+ } else {
+ fprintf(stderr, "Cache initialized at %s\n", config.cache_dir);
+ cache_cleanup(config.cache_ttl_search);
+ }
+
+ set_cache_ttl_search(config.cache_ttl_search);
+ set_cache_ttl_infobox(config.cache_ttl_infobox);
+
if (config.proxy_list_file[0] != '\0') {
if (load_proxy_list(config.proxy_list_file) < 0) {
fprintf(
@@ -82,5 +99,6 @@ int main() {
curl_global_cleanup();
xmlCleanupParser();
free_proxy_list();
+ cache_shutdown();
return EXIT_SUCCESS;
}
diff --git a/src/Scraping/Scraping.c b/src/Scraping/Scraping.c
index a9ad913..7ba2d97 100644
--- a/src/Scraping/Scraping.c
+++ b/src/Scraping/Scraping.c
@@ -1,4 +1,5 @@
#include "Scraping.h"
+#include "../Cache/Cache.h"
#include "../Proxy/Proxy.h"
#include "../Utility/Unescape.h"
#include <curl/curl.h>
@@ -368,6 +369,10 @@ retry:
for (int i = 0; i < num_jobs; i++) {
ScrapeJob *job = &jobs[i];
+ char cache_key[64];
+ char full_url[1024];
+ char *encoded_query = NULL;
+
if (job->handle) {
curl_easy_cleanup(job->handle);
job->handle = NULL;
@@ -376,20 +381,8 @@ retry:
free(job->response.memory);
}
- job->handle = curl_easy_init();
- if (!job->handle) {
- continue;
- }
-
- job->response.memory = (char *)malloc(16384);
- job->response.size = 0;
- job->response.capacity = 16384;
-
- char full_url[1024];
- char *encoded_query = curl_easy_escape(job->handle, job->query, 0);
+ encoded_query = curl_easy_escape(NULL, job->query, 0);
if (!encoded_query) {
- curl_easy_cleanup(job->handle);
- job->handle = NULL;
continue;
}
@@ -399,7 +392,52 @@ retry:
snprintf(full_url, sizeof(full_url), "%s%s&%s=%d", job->engine->base_url,
encoded_query, job->engine->page_param, page_value);
- curl_free(encoded_query);
+
+ char *key = cache_compute_key(job->query, job->page, job->engine->name);
+ if (key) {
+ strncpy(cache_key, key, sizeof(cache_key) - 1);
+ cache_key[sizeof(cache_key) - 1] = '\0';
+ free(key);
+ } else {
+ snprintf(cache_key, sizeof(cache_key), "uncached_%d_%s", i,
+ job->engine->name);
+ }
+
+ char *cached_data = NULL;
+ size_t cached_size = 0;
+ int cache_hit = 0;
+
+ if (get_cache_ttl_search() > 0 &&
+ cache_get(cache_key, (time_t)get_cache_ttl_search(), &cached_data,
+ &cached_size) == 0 &&
+ cached_data && cached_size > 0) {
+ xmlDocPtr doc = htmlReadMemory(cached_data, cached_size, NULL, NULL,
+ HTML_PARSE_RECOVER | HTML_PARSE_NOERROR |
+ HTML_PARSE_NOWARNING);
+ if (doc) {
+ job->results_count = job->engine->parser(
+ job->engine->name, doc, job->out_results, job->max_results);
+ xmlFreeDoc(doc);
+ cache_hit = 1;
+ }
+ free(cached_data);
+ }
+
+ if (cache_hit) {
+ free(encoded_query);
+ job->results_count = job->results_count > 0 ? job->results_count : 0;
+ continue;
+ }
+
+ job->handle = curl_easy_init();
+ if (!job->handle) {
+ free(encoded_query);
+ continue;
+ }
+
+ job->response.memory = (char *)malloc(16384);
+ job->response.size = 0;
+ job->response.capacity = 16384;
struct curl_slist *headers = NULL;
char host_buf[256], ref_buf[256];
@@ -451,6 +489,13 @@ retry:
curl_easy_getinfo(handle, CURLINFO_RESPONSE_CODE, &response_code);
if (msg->data.result == CURLE_OK && job->response.size > 0) {
+ char *key =
+ cache_compute_key(job->query, job->page, job->engine->name);
+ if (key && get_cache_ttl_search() > 0) {
+ cache_set(key, job->response.memory, job->response.size);
+ free(key);
+ }
+
xmlDocPtr doc = htmlReadMemory(
job->response.memory, job->response.size, NULL, NULL,
HTML_PARSE_RECOVER | HTML_PARSE_NOERROR | HTML_PARSE_NOWARNING);