diff options
| -rw-r--r-- | src/Infobox/Calculator.c | 168 | ||||
| -rw-r--r-- | src/Infobox/Dictionary.c | 376 | ||||
| -rw-r--r-- | src/Infobox/Infobox.c | 16 | ||||
| -rw-r--r-- | src/Infobox/Infobox.h | 8 | ||||
| -rw-r--r-- | src/Infobox/UnitConversion.c | 394 | ||||
| -rw-r--r-- | src/Infobox/Wikipedia.c | 144 | ||||
| -rw-r--r-- | src/Main.c | 40 | ||||
| -rw-r--r-- | src/Proxy/Proxy.c | 240 | ||||
| -rw-r--r-- | src/Routes/Home.c | 14 | ||||
| -rw-r--r-- | src/Routes/ImageProxy.c | 88 | ||||
| -rw-r--r-- | src/Routes/Images.c | 302 | ||||
| -rw-r--r-- | src/Routes/Search.c | 336 | ||||
| -rw-r--r-- | src/Scraping/Scraping.c | 570 | ||||
| -rw-r--r-- | src/Scraping/Scraping.h | 10 | ||||
| -rw-r--r-- | src/Utility/Display.c | 72 | ||||
| -rw-r--r-- | src/Utility/Utility.c | 8 |
16 files changed, 1393 insertions, 1393 deletions
diff --git a/src/Infobox/Calculator.c b/src/Infobox/Calculator.c index b80ce21..22563f7 100644 --- a/src/Infobox/Calculator.c +++ b/src/Infobox/Calculator.c @@ -8,108 +8,108 @@ static char logic_log[4096]; typedef struct { - const char *buffer; - int pos; + const char *buffer; + int pos; } Parser; static double parse_expression(Parser *p); static void skip_ws(Parser *p) { - while (p->buffer[p->pos] == ' ') p->pos++; + while (p->buffer[p->pos] == ' ') p->pos++; } static double parse_factor(Parser *p) { - skip_ws(p); - if (p->buffer[p->pos] == '-') { - p->pos++; - return -parse_factor(p); - } - if (p->buffer[p->pos] == '(') { - p->pos++; - double res = parse_expression(p); - if (p->buffer[p->pos] == ')') p->pos++; - return res; - } - char *endptr; - double val = strtod(&p->buffer[p->pos], &endptr); - p->pos = (int)(endptr - p->buffer); - return val; + skip_ws(p); + if (p->buffer[p->pos] == '-') { + p->pos++; + return -parse_factor(p); + } + if (p->buffer[p->pos] == '(') { + p->pos++; + double res = parse_expression(p); + if (p->buffer[p->pos] == ')') p->pos++; + return res; + } + char *endptr; + double val = strtod(&p->buffer[p->pos], &endptr); + p->pos = (int)(endptr - p->buffer); + return val; } static double parse_term(Parser *p) { - double left = parse_factor(p); - while (1) { - skip_ws(p); - char op = p->buffer[p->pos]; - if (op == '*' || op == '/') { - p->pos++; - double right = parse_factor(p); - double old = left; - left = (op == '*') ? left * right : left / right; - - char step[256]; - - snprintf(step, sizeof(step), "<div>%g %c %g = <b>%g</b></div>", old, op, - right, left); - strncat(logic_log, step, sizeof(logic_log) - strlen(logic_log) - 1); - } else - break; - } - return left; + double left = parse_factor(p); + while (1) { + skip_ws(p); + char op = p->buffer[p->pos]; + if (op == '*' || op == '/') { + p->pos++; + double right = parse_factor(p); + double old = left; + left = (op == '*') ? left * right : left / right; + + char step[256]; + + snprintf(step, sizeof(step), "<div>%g %c %g = <b>%g</b></div>", old, op, + right, left); + strncat(logic_log, step, sizeof(logic_log) - strlen(logic_log) - 1); + } else + break; + } + return left; } static double parse_expression(Parser *p) { - double left = parse_term(p); - while (1) { - skip_ws(p); - char op = p->buffer[p->pos]; - if (op == '+' || op == '-') { - p->pos++; - double right = parse_term(p); - double old = left; - left = (op == '+') ? left + right : left - right; - - char step[256]; - - snprintf(step, sizeof(step), "<div>%g %c %g = <b>%g</b></div>", old, op, - right, left); - strncat(logic_log, step, sizeof(logic_log) - strlen(logic_log) - 1); - } else - break; - } - return left; + double left = parse_term(p); + while (1) { + skip_ws(p); + char op = p->buffer[p->pos]; + if (op == '+' || op == '-') { + p->pos++; + double right = parse_term(p); + double old = left; + left = (op == '+') ? left + right : left - right; + + char step[256]; + + snprintf(step, sizeof(step), "<div>%g %c %g = <b>%g</b></div>", old, op, + right, left); + strncat(logic_log, step, sizeof(logic_log) - strlen(logic_log) - 1); + } else + break; + } + return left; } double evaluate(const char *expr) { - logic_log[0] = '\0'; - if (!expr || strlen(expr) == 0) return 0.0; - Parser p = {expr, 0}; - return parse_expression(&p); + logic_log[0] = '\0'; + if (!expr || strlen(expr) == 0) return 0.0; + Parser p = {expr, 0}; + return parse_expression(&p); } InfoBox fetch_calc_data(char *math_input) { - InfoBox info = {NULL, NULL, NULL, NULL}; - if (!math_input) return info; - - double result = evaluate(math_input); - - char html_output[5120]; - snprintf(html_output, sizeof(html_output), - "<div class='calc-container' style='line-height: 1.6;'>" - "%s" - "<div style='margin-top: 8px; border-top: 1px solid #eee; " - "padding-top: 8px; font-size: 1.2em;'>" - "<b>%g</b>" - "</div>" - "</div>", - strlen(logic_log) > 0 ? logic_log : "<div>Constant value</div>", - result); - - info.title = strdup("Calculation"); - info.extract = strdup(html_output); - info.thumbnail_url = - strdup("/static/calculation.svg"); - info.url = strdup("#"); - - return info; + InfoBox info = {NULL, NULL, NULL, NULL}; + if (!math_input) return info; + + double result = evaluate(math_input); + + char html_output[5120]; + snprintf(html_output, sizeof(html_output), + "<div class='calc-container' style='line-height: 1.6;'>" + "%s" + "<div style='margin-top: 8px; border-top: 1px solid #eee; " + "padding-top: 8px; font-size: 1.2em;'>" + "<b>%g</b>" + "</div>" + "</div>", + strlen(logic_log) > 0 ? logic_log : "<div>Constant value</div>", + result); + + info.title = strdup("Calculation"); + info.extract = strdup(html_output); + info.thumbnail_url = + strdup("/static/calculation.svg"); + info.url = strdup("#"); + + return info; } diff --git a/src/Infobox/Dictionary.c b/src/Infobox/Dictionary.c index ca4e5cd..1900c21 100644 --- a/src/Infobox/Dictionary.c +++ b/src/Infobox/Dictionary.c @@ -11,239 +11,239 @@ #include <ctype.h> static const char *PREFIXES[] = { - "what is the definition of ", "what's the definition of ", - "what is the meaning of ", "what's the meaning of ", - "what does the word ", "definition of ", "meaning of ", "def of ", - "define ", "definition ", "define:", "def ", "def:", - "what does ", "what is ", "what's ", "whats ", - "meaning ", "dictionary ", "dict ", NULL + "what is the definition of ", "what's the definition of ", + "what is the meaning of ", "what's the meaning of ", + "what does the word ", "definition of ", "meaning of ", "def of ", + "define ", "definition ", "define:", "def ", "def:", + "what does ", "what is ", "what's ", "whats ", + "meaning ", "dictionary ", "dict ", NULL }; static const char *SUFFIXES[] = { - " definition", " def", " meaning", " mean", " means", - " dictionary", " dict", " define", " defined", - " definition?", " def?", " meaning?", " mean?", " means?", - " in english", " in english?", NULL + " definition", " def", " meaning", " mean", " means", + " dictionary", " dict", " define", " defined", + " definition?", " def?", " meaning?", " mean?", " means?", + " in english", " in english?", NULL }; static const char *SKIP_WORDS[] = {"of ", "the ", "a ", "an ", NULL}; static const char *strcasestr_impl(const char *haystack, const char *needle) { - if (!haystack || !needle || !*needle) return haystack; - size_t len = strlen(needle); - for (const char *h = haystack; *h; h++) { - if (strncasecmp(h, needle, len) == 0) return h; - } - return NULL; + if (!haystack || !needle || !*needle) return haystack; + size_t len = strlen(needle); + for (const char *h = haystack; *h; h++) { + if (strncasecmp(h, needle, len) == 0) return h; + } + return NULL; } struct MemStruct { char *memory; size_t size; }; static size_t WriteCallback(void *contents, size_t size, size_t nmemb, void *userp) { - size_t realsize = size * nmemb; - struct MemStruct *mem = (struct MemStruct *)userp; - char *ptr = realloc(mem->memory, mem->size + realsize + 1); - if (!ptr) return 0; - mem->memory = ptr; - memcpy(&(mem->memory[mem->size]), contents, realsize); - mem->size += realsize; - mem->memory[mem->size] = 0; - return realsize; + size_t realsize = size * nmemb; + struct MemStruct *mem = (struct MemStruct *)userp; + char *ptr = realloc(mem->memory, mem->size + realsize + 1); + if (!ptr) return 0; + mem->memory = ptr; + memcpy(&(mem->memory[mem->size]), contents, realsize); + mem->size += realsize; + mem->memory[mem->size] = 0; + return realsize; } static char *xpath_text(xmlDocPtr doc, const char *xpath) { - xmlXPathContextPtr ctx = xmlXPathNewContext(doc); - if (!ctx) return NULL; - xmlXPathObjectPtr obj = xmlXPathEvalExpression((const xmlChar *)xpath, ctx); - xmlXPathFreeContext(ctx); - if (!obj || !obj->nodesetval || obj->nodesetval->nodeNr == 0) { - if (obj) xmlXPathFreeObject(obj); - return NULL; - } - xmlChar *content = xmlNodeGetContent(obj->nodesetval->nodeTab[0]); - char *result = content ? strdup((char *)content) : NULL; - if (content) xmlFree(content); - xmlXPathFreeObject(obj); - return result; + xmlXPathContextPtr ctx = xmlXPathNewContext(doc); + if (!ctx) return NULL; + xmlXPathObjectPtr obj = xmlXPathEvalExpression((const xmlChar *)xpath, ctx); + xmlXPathFreeContext(ctx); + if (!obj || !obj->nodesetval || obj->nodesetval->nodeNr == 0) { + if (obj) xmlXPathFreeObject(obj); + return NULL; + } + xmlChar *content = xmlNodeGetContent(obj->nodesetval->nodeTab[0]); + char *result = content ? strdup((char *)content) : NULL; + if (content) xmlFree(content); + xmlXPathFreeObject(obj); + return result; } static char *build_html(const char *word, const char *pron, const char *pos, - const char *def, const char *ex) { - char html[4096]; - int n = snprintf(html, sizeof(html), "<div class='dict-container' style='line-height: 1.6;'>"); - if (word) n += snprintf(html + n, sizeof(html) - n, - "<div style='font-size: 1.3em; font-weight: bold; margin-bottom: 4px;'>%s</div>", word); - if (pron) n += snprintf(html + n, sizeof(html) - n, - "<div style='color: #666; margin-bottom: 8px;'>/%s/</div>", pron); - if (pos) n += snprintf(html + n, sizeof(html) - n, - "<div style='font-style: italic; color: #888; margin-bottom: 8px;'>%s</div>", pos); - if (def) n += snprintf(html + n, sizeof(html) - n, - "<div style='margin-bottom: 8px;'>%s</div>", def); - if (ex) n += snprintf(html + n, sizeof(html) - n, - "<div style='color: #555; font-style: italic; margin-top: 8px;'>\"%s\"</div>", ex); - snprintf(html + n, sizeof(html) - n, "</div>"); - return strdup(html); + const char *def, const char *ex) { + char html[4096]; + int n = snprintf(html, sizeof(html), "<div class='dict-container' style='line-height: 1.6;'>"); + if (word) n += snprintf(html + n, sizeof(html) - n, + "<div style='font-size: 1.3em; font-weight: bold; margin-bottom: 4px;'>%s</div>", word); + if (pron) n += snprintf(html + n, sizeof(html) - n, + "<div style='color: #666; margin-bottom: 8px;'>/%s/</div>", pron); + if (pos) n += snprintf(html + n, sizeof(html) - n, + "<div style='font-style: italic; color: #888; margin-bottom: 8px;'>%s</div>", pos); + if (def) n += snprintf(html + n, sizeof(html) - n, + "<div style='margin-bottom: 8px;'>%s</div>", def); + if (ex) n += snprintf(html + n, sizeof(html) - n, + "<div style='color: #555; font-style: italic; margin-top: 8px;'>\"%s\"</div>", ex); + snprintf(html + n, sizeof(html) - n, "</div>"); + return strdup(html); } static char *extract_word(const char *query) { - if (!query) return NULL; + if (!query) return NULL; - const char *start = query; + const char *start = query; - for (int i = 0; PREFIXES[i]; i++) { - size_t len = strlen(PREFIXES[i]); - if (strncasecmp(start, PREFIXES[i], len) == 0) { - start += len; - break; - } + for (int i = 0; PREFIXES[i]; i++) { + size_t len = strlen(PREFIXES[i]); + if (strncasecmp(start, PREFIXES[i], len) == 0) { + start += len; + break; } - - while (*start == ' ') start++; - char *word = strdup(start); - if (!word) return NULL; - - int changed = 1; - while (changed) { - changed = 0; - for (int i = 0; SKIP_WORDS[i]; i++) { - size_t len = strlen(SKIP_WORDS[i]); - if (strncasecmp(word, SKIP_WORDS[i], len) == 0) { - memmove(word, word + len, strlen(word + len) + 1); - changed = 1; - break; - } - } + } + + while (*start == ' ') start++; + char *word = strdup(start); + if (!word) return NULL; + + int changed = 1; + while (changed) { + changed = 0; + for (int i = 0; SKIP_WORDS[i]; i++) { + size_t len = strlen(SKIP_WORDS[i]); + if (strncasecmp(word, SKIP_WORDS[i], len) == 0) { + memmove(word, word + len, strlen(word + len) + 1); + changed = 1; + break; + } } + } - changed = 1; - while (changed) { - changed = 0; - for (int i = 0; SUFFIXES[i]; i++) { - const char *found = strcasestr_impl(word, SUFFIXES[i]); - if (found) { - char *pos = word + (found - word); - *pos = '\0'; - changed = 1; - break; - } - } + changed = 1; + while (changed) { + changed = 0; + for (int i = 0; SUFFIXES[i]; i++) { + const char *found = strcasestr_impl(word, SUFFIXES[i]); + if (found) { + char *pos = word + (found - word); + *pos = '\0'; + changed = 1; + break; + } } + } - size_t len = strlen(word); - while (len > 0 && (word[len-1] == ' ' || word[len-1] == '?' || - word[len-1] == '!' || word[len-1] == '.')) { - word[--len] = '\0'; - } + size_t len = strlen(word); + while (len > 0 && (word[len-1] == ' ' || word[len-1] == '?' || + word[len-1] == '!' || word[len-1] == '.')) { + word[--len] = '\0'; + } - if (len == 0) { free(word); return NULL; } + if (len == 0) { free(word); return NULL; } - for (size_t i = 0; i < len; i++) word[i] = tolower((unsigned char)word[i]); - char *space = strchr(word, ' '); - if (space) *space = '\0'; + for (size_t i = 0; i < len; i++) word[i] = tolower((unsigned char)word[i]); + char *space = strchr(word, ' '); + if (space) *space = '\0'; - return word; + return word; } int is_dictionary_query(const char *query) { - if (!query) return 0; - - for (int i = 0; PREFIXES[i]; i++) { - size_t len = strlen(PREFIXES[i]); - if (strncasecmp(query, PREFIXES[i], len) == 0) { - const char *after = query + len; - while (*after == ' ') after++; - if (*after != '\0') return 1; - } + if (!query) return 0; + + for (int i = 0; PREFIXES[i]; i++) { + size_t len = strlen(PREFIXES[i]); + if (strncasecmp(query, PREFIXES[i], len) == 0) { + const char *after = query + len; + while (*after == ' ') after++; + if (*after != '\0') return 1; } - - for (int i = 0; SUFFIXES[i]; i++) { - const char *pos = strcasestr_impl(query, SUFFIXES[i]); - if (pos) { - const char *after = pos + strlen(SUFFIXES[i]); - while (*after == ' ' || *after == '?' || *after == '!' || *after == '.') after++; - if (*after == '\0' && pos > query && (pos - query) < 100) return 1; - } + } + + for (int i = 0; SUFFIXES[i]; i++) { + const char *pos = strcasestr_impl(query, SUFFIXES[i]); + if (pos) { + const char *after = pos + strlen(SUFFIXES[i]); + while (*after == ' ' || *after == '?' || *after == '!' || *after == '.') after++; + if (*after == '\0' && pos > query && (pos - query) < 100) return 1; } - - if (strncasecmp(query, "what is ", 8) == 0 || - strncasecmp(query, "what's ", 7) == 0 || - strncasecmp(query, "whats ", 6) == 0) { - const char *word = query + (strncasecmp(query, "what is ", 8) == 0 ? 8 : - strncasecmp(query, "what's ", 7) == 0 ? 7 : 6); - const char *articles[] = {"the ", "your ", "my ", "his ", "her ", "their ", - "our ", "this ", "that ", "these ", "those ", "a ", "an ", NULL}; - for (int i = 0; articles[i]; i++) { - if (strncasecmp(word, articles[i], strlen(articles[i])) == 0) return 0; - } - const char *space = strchr(word, ' '); - if (!space || *(space + 1) == '\0' || *(space + 1) == '?') return 1; + } + + if (strncasecmp(query, "what is ", 8) == 0 || + strncasecmp(query, "what's ", 7) == 0 || + strncasecmp(query, "whats ", 6) == 0) { + const char *word = query + (strncasecmp(query, "what is ", 8) == 0 ? 8 : + strncasecmp(query, "what's ", 7) == 0 ? 7 : 6); + const char *articles[] = {"the ", "your ", "my ", "his ", "her ", "their ", + "our ", "this ", "that ", "these ", "those ", "a ", "an ", NULL}; + for (int i = 0; articles[i]; i++) { + if (strncasecmp(word, articles[i], strlen(articles[i])) == 0) return 0; } + const char *space = strchr(word, ' '); + if (!space || *(space + 1) == '\0' || *(space + 1) == '?') return 1; + } - return 0; + return 0; } char *construct_dictionary_url(const char *query) { - char *word = extract_word(query); - if (!word) return NULL; - - CURL *curl = curl_easy_init(); - if (!curl) { free(word); return NULL; } - - char *escaped = curl_easy_escape(curl, word, 0); - const char *base = "https://dictionary.cambridge.org/dictionary/english/"; - char *url = malloc(strlen(base) + strlen(escaped) + 1); - if (url) { - strcpy(url, base); - strcat(url, escaped); - } - - curl_free(escaped); - curl_easy_cleanup(curl); - free(word); - return url; + char *word = extract_word(query); + if (!word) return NULL; + + CURL *curl = curl_easy_init(); + if (!curl) { free(word); return NULL; } + + char *escaped = curl_easy_escape(curl, word, 0); + const char *base = "https://dictionary.cambridge.org/dictionary/english/"; + char *url = malloc(strlen(base) + strlen(escaped) + 1); + if (url) { + strcpy(url, base); + strcat(url, escaped); + } + + curl_free(escaped); + curl_easy_cleanup(curl); + free(word); + return url; } InfoBox fetch_dictionary_data(const char *query) { - InfoBox info = {NULL, NULL, NULL, NULL}; - - char *url = construct_dictionary_url(query); - if (!url) return info; - - CURL *curl = curl_easy_init(); - if (!curl) { free(url); return info; } - - struct MemStruct chunk = {malloc(1), 0}; - curl_easy_setopt(curl, CURLOPT_URL, url); - curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, WriteCallback); - curl_easy_setopt(curl, CURLOPT_WRITEDATA, &chunk); - curl_easy_setopt(curl, CURLOPT_USERAGENT, "Mozilla/5.0"); - curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1L); - apply_proxy_settings(curl); - - if (curl_easy_perform(curl) == CURLE_OK && chunk.size > 0) { - htmlDocPtr doc = htmlReadMemory(chunk.memory, chunk.size, url, NULL, - HTML_PARSE_RECOVER | HTML_PARSE_NOERROR | HTML_PARSE_NOWARNING); - if (doc) { - char *word = xpath_text(doc, "//span[@class='hw dhw']"); - char *pron = xpath_text(doc, "//span[@class='us dpron-i']//span[@class='ipa dipa lpr-2 lpl-1']"); - char *pos = xpath_text(doc, "//span[@class='pos dpos']"); - char *def = xpath_text(doc, "(//div[@class='def ddef_d db'])[1]"); - char *ex = xpath_text(doc, "(//span[@class='eg deg'])[1]"); - - if (word && def) { - info.title = strdup("Dictionary"); - info.extract = build_html(word, pron, pos, def, ex); - info.thumbnail_url = strdup("/static/dictionary.jpg"); - info.url = strdup(url); - } - - free(word); free(pron); free(pos); free(def); free(ex); - xmlFreeDoc(doc); - } + InfoBox info = {NULL, NULL, NULL, NULL}; + + char *url = construct_dictionary_url(query); + if (!url) return info; + + CURL *curl = curl_easy_init(); + if (!curl) { free(url); return info; } + + struct MemStruct chunk = {malloc(1), 0}; + curl_easy_setopt(curl, CURLOPT_URL, url); + curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, WriteCallback); + curl_easy_setopt(curl, CURLOPT_WRITEDATA, &chunk); + curl_easy_setopt(curl, CURLOPT_USERAGENT, "Mozilla/5.0"); + curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1L); + apply_proxy_settings(curl); + + if (curl_easy_perform(curl) == CURLE_OK && chunk.size > 0) { + htmlDocPtr doc = htmlReadMemory(chunk.memory, chunk.size, url, NULL, + HTML_PARSE_RECOVER | HTML_PARSE_NOERROR | HTML_PARSE_NOWARNING); + if (doc) { + char *word = xpath_text(doc, "//span[@class='hw dhw']"); + char *pron = xpath_text(doc, "//span[@class='us dpron-i']//span[@class='ipa dipa lpr-2 lpl-1']"); + char *pos = xpath_text(doc, "//span[@class='pos dpos']"); + char *def = xpath_text(doc, "(//div[@class='def ddef_d db'])[1]"); + char *ex = xpath_text(doc, "(//span[@class='eg deg'])[1]"); + + if (word && def) { + info.title = strdup("Dictionary"); + info.extract = build_html(word, pron, pos, def, ex); + info.thumbnail_url = strdup("/static/dictionary.jpg"); + info.url = strdup(url); + } + + free(word); free(pron); free(pos); free(def); free(ex); + xmlFreeDoc(doc); } + } - curl_easy_cleanup(curl); - free(chunk.memory); - free(url); - return info; + curl_easy_cleanup(curl); + free(chunk.memory); + free(url); + return info; }
\ No newline at end of file diff --git a/src/Infobox/Infobox.c b/src/Infobox/Infobox.c index 5043c05..337b057 100644 --- a/src/Infobox/Infobox.c +++ b/src/Infobox/Infobox.c @@ -2,12 +2,12 @@ #include <stdlib.h> void free_infobox(InfoBox *info) { - if (info->title) - free(info->title); - if (info->thumbnail_url) - free(info->thumbnail_url); - if (info->extract) - free(info->extract); - if (info->url) - free(info->url); + if (info->title) + free(info->title); + if (info->thumbnail_url) + free(info->thumbnail_url); + if (info->extract) + free(info->extract); + if (info->url) + free(info->url); } diff --git a/src/Infobox/Infobox.h b/src/Infobox/Infobox.h index a052b80..ecb16b6 100644 --- a/src/Infobox/Infobox.h +++ b/src/Infobox/Infobox.h @@ -2,10 +2,10 @@ #define INFOBOX_H typedef struct { - char *title; - char *thumbnail_url; - char *extract; - char *url; + char *title; + char *thumbnail_url; + char *extract; + char *url; } InfoBox; void free_infobox(InfoBox *info); diff --git a/src/Infobox/UnitConversion.c b/src/Infobox/UnitConversion.c index 6a08b9c..933dd71 100644 --- a/src/Infobox/UnitConversion.c +++ b/src/Infobox/UnitConversion.c @@ -82,24 +82,24 @@ static const UnitDef *find_unit(const char *str) { size_t j = 0; for (size_t i = 0; i < len && j < 63; i++) { - if ((unsigned char)str[i] == 0xC2 && (unsigned char)str[i+1] == 0xB0) { - i++; - continue; - } - if (str[i] == '^' && i + 1 < len && str[i + 1] == '2') { - normalized[j++] = '2'; - i++; - continue; - } - normalized[j++] = tolower((unsigned char)str[i]); + if ((unsigned char)str[i] == 0xC2 && (unsigned char)str[i+1] == 0xB0) { + i++; + continue; + } + if (str[i] == '^' && i + 1 < len && str[i + 1] == '2') { + normalized[j++] = '2'; + i++; + continue; + } + normalized[j++] = tolower((unsigned char)str[i]); } normalized[j] = '\0'; for (int i = 0; i < UNIT_COUNT; i++) { - if (strcmp(normalized, UNITS[i].name) == 0) return &UNITS[i]; - for (int k = 0; k < 4 && UNITS[i].alias[k]; k++) { - if (strcmp(normalized, UNITS[i].alias[k]) == 0) return &UNITS[i]; - } + if (strcmp(normalized, UNITS[i].name) == 0) return &UNITS[i]; + for (int k = 0; k < 4 && UNITS[i].alias[k]; k++) { + if (strcmp(normalized, UNITS[i].alias[k]) == 0) return &UNITS[i]; + } } return NULL; } @@ -108,37 +108,37 @@ int is_unit_conv_query(const char *query) { if (!query) return 0; const char *patterns[] = { - " to ", " in ", " into ", - " = ", " equals ", " equal ", - " convert ", " conversion ", - " -> ", " → ", - NULL + " to ", " in ", " into ", + " = ", " equals ", " equal ", + " convert ", " conversion ", + " -> ", " → ", + NULL }; int has_pattern = 0; for (int i = 0; patterns[i]; i++) { - if (strstr(query, patterns[i])) { - has_pattern = 1; - break; - } + if (strstr(query, patterns[i])) { + has_pattern = 1; + break; + } } if (!has_pattern) { - const char *last_space = strrchr(query, ' '); - if (last_space) { - const UnitDef *u = find_unit(last_space + 1); - if (u) { - const char *before = query; - while (*before && is_whitespace(*before)) before++; - const char *num_end = before; - while (*num_end && - (isdigit(*num_end) || *num_end == '.' || *num_end == '-' || - *num_end == '+' || *num_end == '/' || *num_end == '\'' || *num_end == '"')) { - num_end++; - } - if (num_end > before) has_pattern = 1; - } + const char *last_space = strrchr(query, ' '); + if (last_space) { + const UnitDef *u = find_unit(last_space + 1); + if (u) { + const char *before = query; + while (*before && is_whitespace(*before)) before++; + const char *num_end = before; + while (*num_end && + (isdigit(*num_end) || *num_end == '.' || *num_end == '-' || + *num_end == '+' || *num_end == '/' || *num_end == '\'' || *num_end == '"')) { + num_end++; } + if (num_end > before) has_pattern = 1; + } + } } return has_pattern; @@ -153,58 +153,58 @@ static double parse_value(const char **ptr) { if (*p == '-' || *p == '+') p++; while (*p >= '0' && *p <= '9') { - value = value * 10 + (*p - '0'); - has_num = 1; - p++; + value = value * 10 + (*p - '0'); + has_num = 1; + p++; } if (*p == '.') { + p++; + double frac = 0.1; + while (*p >= '0' && *p <= '9') { + value += (*p - '0') * frac; + frac *= 0.1; + has_num = 1; p++; - double frac = 0.1; - while (*p >= '0' && *p <= '9') { - value += (*p - '0') * frac; - frac *= 0.1; - has_num = 1; - p++; - } + } } if (*p == '/' && has_num) { + p++; + double denom = 0.0; + int has_denom = 0; + while (*p >= '0' && *p <= '9') { + denom = denom * 10 + (*p - '0'); + has_denom = 1; p++; - double denom = 0.0; - int has_denom = 0; - while (*p >= '0' && *p <= '9') { - denom = denom * 10 + (*p - '0'); - has_denom = 1; - p++; - } - if (has_denom && denom > 0) { - value = value / denom; - } + } + if (has_denom && denom > 0) { + value = value / denom; + } } while (*p == '\'' || *p == '"') { - double extra = 0.0; + double extra = 0.0; + p++; + while (*p >= '0' && *p <= '9') { + extra = extra * 10 + (*p - '0'); + p++; + } + if (*p == '.') { p++; + double frac = 0.1; while (*p >= '0' && *p <= '9') { - extra = extra * 10 + (*p - '0'); - p++; - } - if (*p == '.') { - p++; - double frac = 0.1; - while (*p >= '0' && *p <= '9') { - extra += (*p - '0') * frac; - frac *= 0.1; - p++; - } + extra += (*p - '0') * frac; + frac *= 0.1; + p++; } - if (*p == '\'' || *p == '"') p++; - value += extra * (p[-1] == '\'' ? 0.3048 : 0.0254); + } + if (*p == '\'' || *p == '"') p++; + value += extra * (p[-1] == '\'' ? 0.3048 : 0.0254); } if (!has_num) { - *ptr = p; - return 0.0; + *ptr = p; + return 0.0; } *ptr = p; @@ -235,29 +235,29 @@ static int parse_conversion_query(const char *query, double *value, const UnitDe const char *to_pos = NULL; size_t keyword_len = 0; for (int i = 0; to_keywords[i]; i++) { - const char *found = strstr(p, to_keywords[i]); - if (found) { - to_pos = found + strlen(to_keywords[i]); - keyword_len = strlen(to_keywords[i]); - break; - } + const char *found = strstr(p, to_keywords[i]); + if (found) { + to_pos = found + strlen(to_keywords[i]); + keyword_len = strlen(to_keywords[i]); + break; + } } if (!to_pos) { - const char *last_space = strrchr(p, ' '); - if (last_space && last_space > p) { - char from_part[64] = {0}; - size_t len = last_space - p; - if (len < 63) { - strncpy(from_part, p, len); - *from_unit = find_unit(from_part); - if (*from_unit) { - *to_unit = find_unit(last_space + 1); - return *to_unit ? 1 : 0; - } - } + const char *last_space = strrchr(p, ' '); + if (last_space && last_space > p) { + char from_part[64] = {0}; + size_t len = last_space - p; + if (len < 63) { + strncpy(from_part, p, len); + *from_unit = find_unit(from_part); + if (*from_unit) { + *to_unit = find_unit(last_space + 1); + return *to_unit ? 1 : 0; + } } - return 0; + } + return 0; } char from_part[64] = {0}; @@ -271,20 +271,20 @@ static int parse_conversion_query(const char *query, double *value, const UnitDe *from_unit = find_unit(from_part); if (!*from_unit) { - char *end = from_part + strlen(from_part); - while (end > from_part) { - while (end > from_part && is_whitespace(end[-1])) end--; - if (end <= from_part) break; - char *start = end; - while (start > from_part && !is_whitespace(start[-1])) start--; - size_t word_len = end - start; - memmove(from_part + word_len + 1, from_part, start - from_part); - from_part[word_len] = ' '; - from_part[word_len + 1] = '\0'; - *from_unit = find_unit(from_part); - if (*from_unit) break; - end = start; - } + char *end = from_part + strlen(from_part); + while (end > from_part) { + while (end > from_part && is_whitespace(end[-1])) end--; + if (end <= from_part) break; + char *start = end; + while (start > from_part && !is_whitespace(start[-1])) start--; + size_t word_len = end - start; + memmove(from_part + word_len + 1, from_part, start - from_part); + from_part[word_len] = ' '; + from_part[word_len + 1] = '\0'; + *from_unit = find_unit(from_part); + if (*from_unit) break; + end = start; + } } if (!*from_unit) return 0; @@ -297,30 +297,30 @@ static int parse_conversion_query(const char *query, double *value, const UnitDe size_t to_len = 0; const char *tp = to_pos; while (*tp && !is_separator(*tp) && to_len < 63) { - to_part[to_len++] = *tp++; + to_part[to_len++] = *tp++; } to_part[to_len] = '\0'; *to_unit = find_unit(to_part); if (!*to_unit) { - const char *try_ptr = to_pos; - while (*try_ptr && is_whitespace(*try_ptr)) try_ptr++; - char try_buf[64] = {0}; - size_t try_len = 0; - while (*try_ptr && try_len < 63) { - try_buf[try_len++] = *try_ptr++; - } - while (try_len > 0) { - *to_unit = find_unit(try_buf); - if (*to_unit) { - strcpy(to_part, try_buf); - break; - } - char *last_space = strrchr(try_buf, ' '); - if (!last_space) break; - *last_space = '\0'; - try_len = strlen(try_buf); + const char *try_ptr = to_pos; + while (*try_ptr && is_whitespace(*try_ptr)) try_ptr++; + char try_buf[64] = {0}; + size_t try_len = 0; + while (*try_ptr && try_len < 63) { + try_buf[try_len++] = *try_ptr++; + } + while (try_len > 0) { + *to_unit = find_unit(try_buf); + if (*to_unit) { + strcpy(to_part, try_buf); + break; } + char *last_space = strrchr(try_buf, ' '); + if (!last_space) break; + *last_space = '\0'; + try_len = strlen(try_buf); + } } return *to_unit ? 1 : 0; @@ -343,7 +343,7 @@ static double convert_value(double value, const UnitDef *from, const UnitDef *to if (from->type != to->type) return 0; if (from->type == UNIT_TEMP) { - return convert_temp(value, from, to); + return convert_temp(value, from, to); } double base_value = value * from->to_base; @@ -353,23 +353,23 @@ static double convert_value(double value, const UnitDef *from, const UnitDef *to static void format_number(double val, char *buf, size_t bufsize) { if (bufsize == 0) return; if (val == 0) { - snprintf(buf, bufsize, "0"); - return; + snprintf(buf, bufsize, "0"); + return; } if (fabs(val) < 0.01 && fabs(val) > 0) { - snprintf(buf, bufsize, "%.2g", val); + snprintf(buf, bufsize, "%.2g", val); } else if (fabs(val) < 1) { - snprintf(buf, bufsize, "%.2f", val); - char *p = buf + strlen(buf) - 1; - while (p > buf && *p == '0') *p-- = '\0'; - if (*p == '.') *p = '\0'; + snprintf(buf, bufsize, "%.2f", val); + char *p = buf + strlen(buf) - 1; + while (p > buf && *p == '0') *p-- = '\0'; + if (*p == '.') *p = '\0'; } else if (fmod(val + 0.0001, 1.0) < 0.0002) { - snprintf(buf, bufsize, "%.0f", val); + snprintf(buf, bufsize, "%.0f", val); } else { - snprintf(buf, bufsize, "%.2f", val); - char *p = buf + strlen(buf) - 1; - while (p > buf && *p == '0') *p-- = '\0'; - if (*p == '.') *p = '\0'; + snprintf(buf, bufsize, "%.2f", val); + char *p = buf + strlen(buf) - 1; + while (p > buf && *p == '0') *p-- = '\0'; + if (*p == '.') *p = '\0'; } } @@ -383,74 +383,74 @@ static const char *pluralize(const char *unit, double value, char *buf, size_t b buf[bufsize - 1] = '\0'; if (strcmp(unit, "foot") == 0 || strcmp(unit, "square foot") == 0) { - if (is_one) strcpy(buf, unit); - else strcpy(buf, strcmp(unit, "square foot") == 0 ? "square feet" : "feet"); - return buf; + if (is_one) strcpy(buf, unit); + else strcpy(buf, strcmp(unit, "square foot") == 0 ? "square feet" : "feet"); + return buf; } if (strcmp(unit, "inch") == 0 || strcmp(unit, "square inch") == 0) { - if (is_one) strcpy(buf, unit); - else strcpy(buf, strcmp(unit, "square inch") == 0 ? "square inches" : "inches"); - return buf; + if (is_one) strcpy(buf, unit); + else strcpy(buf, strcmp(unit, "square inch") == 0 ? "square inches" : "inches"); + return buf; } if (strcmp(unit, "stone") == 0) { - if (is_one) strcpy(buf, "stone"); - else strcpy(buf, "stones"); - return buf; + if (is_one) strcpy(buf, "stone"); + else strcpy(buf, "stones"); + return buf; } if (strcmp(unit, "celsius") == 0 || - strcmp(unit, "fahrenheit") == 0 || - strcmp(unit, "kelvin") == 0) { - strcpy(buf, unit); - return buf; + strcmp(unit, "fahrenheit") == 0 || + strcmp(unit, "kelvin") == 0) { + strcpy(buf, unit); + return buf; } if (unit[len-1] == 's' || - unit[len-1] == 'x' || - unit[len-1] == 'z' || - (len >= 2 && unit[len-2] == 'c' && unit[len-1] == 'h') || - (len >= 2 && unit[len-2] == 's' && unit[len-1] == 'h')) { - if (!is_one) { - buf[len] = 'e'; - buf[len+1] = '\0'; - } + unit[len-1] == 'x' || + unit[len-1] == 'z' || + (len >= 2 && unit[len-2] == 'c' && unit[len-1] == 'h') || + (len >= 2 && unit[len-2] == 's' && unit[len-1] == 'h')) { + if (!is_one) { + buf[len] = 'e'; + buf[len+1] = '\0'; + } } else if (unit[len-1] == 'y' && len >= 2 && - !(unit[len-2] == 'a' || unit[len-2] == 'e' || - unit[len-2] == 'i' || unit[len-2] == 'o' || - unit[len-2] == 'u')) { - if (is_one) { - buf[len-1] = '\0'; - } else { - buf[len] = 's'; - buf[len+1] = '\0'; - } + !(unit[len-2] == 'a' || unit[len-2] == 'e' || + unit[len-2] == 'i' || unit[len-2] == 'o' || + unit[len-2] == 'u')) { + if (is_one) { + buf[len-1] = '\0'; + } else { + buf[len] = 's'; + buf[len+1] = '\0'; + } } else if (len >= 2 && unit[len-2] == 'f' && unit[len-1] == 'e') { - if (is_one) { - buf[len-2] = '\0'; - } else { - buf[len-1] = 's'; - buf[len] = '\0'; - } + if (is_one) { + buf[len-2] = '\0'; + } else { + buf[len-1] = 's'; + buf[len] = '\0'; + } } else if (unit[len-1] == 'f' && len >= 1) { - if (is_one) { - buf[len-1] = '\0'; - } else { - buf[len-1] = 'v'; - buf[len] = 'e'; - buf[len+1] = 's'; - buf[len+2] = '\0'; - } + if (is_one) { + buf[len-1] = '\0'; + } else { + buf[len-1] = 'v'; + buf[len] = 'e'; + buf[len+1] = 's'; + buf[len+2] = '\0'; + } } else if (unit[len-1] == 'e' && len >= 2 && unit[len-2] == 'f') { - if (is_one) { - buf[len-2] = '\0'; - } else { - buf[len-1] = 's'; - buf[len] = '\0'; - } + if (is_one) { + buf[len-2] = '\0'; } else { - if (!is_one) { - buf[len] = 's'; - buf[len+1] = '\0'; - } + buf[len-1] = 's'; + buf[len] = '\0'; + } + } else { + if (!is_one) { + buf[len] = 's'; + buf[len+1] = '\0'; + } } return buf; @@ -466,12 +466,12 @@ static char *build_html(double value, const UnitDef *from, double result, const pluralize(to->name, result, to_name_buf, sizeof(to_name_buf)); int n = snprintf(html, sizeof(html), - "<div class='unit-conv-container' style='line-height: 1.6;'>" - "<div style='font-size: 1.3em; margin-bottom: 8px;'>" - "<b>%s %s</b> = <b>%s %s</b>" - "</div>", - val_buf, from_name_buf, - res_buf, to_name_buf); + "<div class='unit-conv-container' style='line-height: 1.6;'>" + "<div style='font-size: 1.3em; margin-bottom: 8px;'>" + "<b>%s %s</b> = <b>%s %s</b>" + "</div>", + val_buf, from_name_buf, + res_buf, to_name_buf); snprintf(html + n, sizeof(html) - n, "</div>"); return html; } diff --git a/src/Infobox/Wikipedia.c b/src/Infobox/Wikipedia.c index dff26f6..09c13c6 100644 --- a/src/Infobox/Wikipedia.c +++ b/src/Infobox/Wikipedia.c @@ -23,32 +23,32 @@ static void shorten_summary(char **extract_ptr, int max_chars) { int end_pos = max_chars; for (int i = max_chars; i > (max_chars / 2); i--) { - if (text[i] == '.' || text[i] == '!' || text[i] == '?') { - end_pos = i + 1; - break; - } + if (text[i] == '.' || text[i] == '!' || text[i] == '?') { + end_pos = i + 1; + break; + } } char *new_text = (char *)malloc(end_pos + 4); if (new_text) { - strncpy(new_text, text, end_pos); - new_text[end_pos] = '\0'; - strcat(new_text, "..."); - free(*extract_ptr); - *extract_ptr = new_text; + strncpy(new_text, text, end_pos); + new_text[end_pos] = '\0'; + strcat(new_text, "..."); + free(*extract_ptr); + *extract_ptr = new_text; } } static size_t WikiWriteMemoryCallback(void *contents, size_t size, size_t nmemb, - void *userp) { + void *userp) { size_t realsize = size * nmemb; struct WikiMemoryStruct *mem = (struct WikiMemoryStruct *)userp; char *ptr = realloc(mem->memory, mem->size + realsize + 1); if (ptr == NULL) { - fprintf(stderr, "Not enough memory (realloc returned NULL)\n"); - return 0; + fprintf(stderr, "Not enough memory (realloc returned NULL)\n"); + return 0; } mem->memory = ptr; @@ -63,48 +63,48 @@ static void extract_wiki_info(xmlNode *node, InfoBox *info) { xmlNode *cur_node = NULL; for (cur_node = node; cur_node; cur_node = cur_node->next) { - if (cur_node->type == XML_ELEMENT_NODE) { - if (strcmp((const char *)cur_node->name, "page") == 0) { - xmlChar *title = xmlGetProp(cur_node, (const xmlChar *)"title"); - if (title) { - info->title = strdup((const char *)title); - - const char *base_article_url = "https://en.wikipedia.org/wiki/"; - char *formatted_title = strdup((const char *)title); - for (int i = 0; formatted_title[i]; i++) { - if (formatted_title[i] == ' ') formatted_title[i] = '_'; - } - - info->url = - malloc(strlen(base_article_url) + strlen(formatted_title) + 1); - if (info->url) { - strcpy(info->url, base_article_url); - strcat(info->url, formatted_title); - } - free(formatted_title); - xmlFree(title); - } + if (cur_node->type == XML_ELEMENT_NODE) { + if (strcmp((const char *)cur_node->name, "page") == 0) { + xmlChar *title = xmlGetProp(cur_node, (const xmlChar *)"title"); + if (title) { + info->title = strdup((const char *)title); + + const char *base_article_url = "https://en.wikipedia.org/wiki/"; + char *formatted_title = strdup((const char *)title); + for (int i = 0; formatted_title[i]; i++) { + if (formatted_title[i] == ' ') formatted_title[i] = '_'; } - if (strcmp((const char *)cur_node->name, "thumbnail") == 0) { - xmlChar *source = xmlGetProp(cur_node, (const xmlChar *)"source"); - if (source) { - info->thumbnail_url = strdup((const char *)source); - xmlFree(source); - } + info->url = + malloc(strlen(base_article_url) + strlen(formatted_title) + 1); + if (info->url) { + strcpy(info->url, base_article_url); + strcat(info->url, formatted_title); } + free(formatted_title); + xmlFree(title); + } + } + + if (strcmp((const char *)cur_node->name, "thumbnail") == 0) { + xmlChar *source = xmlGetProp(cur_node, (const xmlChar *)"source"); + if (source) { + info->thumbnail_url = strdup((const char *)source); + xmlFree(source); + } + } - if (strcmp((const char *)cur_node->name, "extract") == 0) { - xmlChar *content = xmlNodeGetContent(cur_node); - if (content) { - info->extract = strdup((const char *)content); + if (strcmp((const char *)cur_node->name, "extract") == 0) { + xmlChar *content = xmlNodeGetContent(cur_node); + if (content) { + info->extract = strdup((const char *)content); - shorten_summary(&(info->extract), 300); - xmlFree(content); - } - } + shorten_summary(&(info->extract), 300); + xmlFree(content); + } } - extract_wiki_info(cur_node->children, info); + } + extract_wiki_info(cur_node->children, info); } } @@ -120,27 +120,27 @@ InfoBox fetch_wiki_data(char *api_url) { curl_handle = curl_easy_init(); if (curl_handle) { - curl_easy_setopt(curl_handle, CURLOPT_URL, api_url); - curl_easy_setopt(curl_handle, CURLOPT_WRITEFUNCTION, - WikiWriteMemoryCallback); - curl_easy_setopt(curl_handle, CURLOPT_WRITEDATA, (void *)&chunk); - curl_easy_setopt(curl_handle, CURLOPT_USERAGENT, "libcurl-agent/1.0"); - apply_proxy_settings(curl_handle); - - res = curl_easy_perform(curl_handle); - - if (res == CURLE_OK) { - xmlDocPtr doc = - xmlReadMemory(chunk.memory, chunk.size, "noname.xml", NULL, 0); - if (doc != NULL) { - xmlNode *root_element = xmlDocGetRootElement(doc); - extract_wiki_info(root_element, &info); - xmlFreeDoc(doc); - } + curl_easy_setopt(curl_handle, CURLOPT_URL, api_url); + curl_easy_setopt(curl_handle, CURLOPT_WRITEFUNCTION, + WikiWriteMemoryCallback); + curl_easy_setopt(curl_handle, CURLOPT_WRITEDATA, (void *)&chunk); + curl_easy_setopt(curl_handle, CURLOPT_USERAGENT, "libcurl-agent/1.0"); + apply_proxy_settings(curl_handle); + + res = curl_easy_perform(curl_handle); + + if (res == CURLE_OK) { + xmlDocPtr doc = + xmlReadMemory(chunk.memory, chunk.size, "noname.xml", NULL, 0); + if (doc != NULL) { + xmlNode *root_element = xmlDocGetRootElement(doc); + extract_wiki_info(root_element, &info); + xmlFreeDoc(doc); } + } - curl_easy_cleanup(curl_handle); - free(chunk.memory); + curl_easy_cleanup(curl_handle); + free(chunk.memory); } return info; @@ -152,14 +152,14 @@ char *construct_wiki_url(const char *search_term) { char *escaped_term = curl_easy_escape(curl, search_term, 0); const char *base = - "https://en.wikipedia.org/w/" - "api.php?action=query&prop=extracts|pageimages&exintro&" - "explaintext&pithumbsize=400&format=xml&origin=*&titles="; + "https://en.wikipedia.org/w/" + "api.php?action=query&prop=extracts|pageimages&exintro&" + "explaintext&pithumbsize=400&format=xml&origin=*&titles="; char *full_url = malloc(strlen(base) + strlen(escaped_term) + 1); if (full_url) { - strcpy(full_url, base); - strcat(full_url, escaped_term); + strcpy(full_url, base); + strcat(full_url, escaped_term); } curl_free(escaped_term); @@ -14,9 +14,9 @@ #include "Routes/Search.h" int handle_opensearch(UrlParams *params) { - (void)params; - serve_static_file_with_mime("opensearch.xml", "application/opensearchdescription+xml"); - return 0; + (void)params; + serve_static_file_with_mime("opensearch.xml", "application/opensearchdescription+xml"); + return 0; } int main() { @@ -31,32 +31,32 @@ int main() { curl_global_init(CURL_GLOBAL_DEFAULT); Config config = { - .host = "0.0.0.0", - .port = 5000, - .proxy = "", - .proxy_list_file = "", - .max_proxy_retries = 3, - .randomize_username = 0, - .randomize_password = 0 + .host = "0.0.0.0", + .port = 5000, + .proxy = "", + .proxy_list_file = "", + .max_proxy_retries = 3, + .randomize_username = 0, + .randomize_password = 0 }; if (load_config("config.ini", &config) != 0) { - fprintf(stderr, "Warning: Could not load config file, using defaults\n"); + fprintf(stderr, "Warning: Could not load config file, using defaults\n"); } if (config.proxy_list_file[0] != '\0') { - if (load_proxy_list(config.proxy_list_file) < 0) { - fprintf(stderr, "Warning: Failed to load proxy list, continuing without proxies\n"); - } + if (load_proxy_list(config.proxy_list_file) < 0) { + fprintf(stderr, "Warning: Failed to load proxy list, continuing without proxies\n"); + } } max_proxy_retries = config.max_proxy_retries; set_proxy_config(config.proxy, config.randomize_username, config.randomize_password); if (proxy_url[0] != '\0') { - fprintf(stderr, "Using proxy: %s\n", proxy_url); + fprintf(stderr, "Using proxy: %s\n", proxy_url); } else if (proxy_count > 0) { - fprintf(stderr, "Using %d proxies from %s\n", proxy_count, config.proxy_list_file); + fprintf(stderr, "Using %d proxies from %s\n", proxy_count, config.proxy_list_file); } set_handler("/", home_handler); @@ -70,10 +70,10 @@ int main() { int result = beaker_run(config.host, config.port); if (result != 0) { - fprintf(stderr, "Error: Beaker server failed to start.\n"); - curl_global_cleanup(); - xmlCleanupParser(); - return EXIT_FAILURE; + fprintf(stderr, "Error: Beaker server failed to start.\n"); + curl_global_cleanup(); + xmlCleanupParser(); + return EXIT_FAILURE; } curl_global_cleanup(); diff --git a/src/Proxy/Proxy.c b/src/Proxy/Proxy.c index 939aea0..9908350 100644 --- a/src/Proxy/Proxy.c +++ b/src/Proxy/Proxy.c @@ -17,15 +17,15 @@ static const char RAND_CHARS[] = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRST static void generate_random_string(char *buf, size_t len) { for (size_t i = 0; i < len - 1; i++) { - buf[i] = RAND_CHARS[rand() % (sizeof(RAND_CHARS) - 1)]; + buf[i] = RAND_CHARS[rand() % (sizeof(RAND_CHARS) - 1)]; } buf[len - 1] = '\0'; } void set_proxy_config(const char *proxy_str, int rand_user, int rand_pass) { if (proxy_str && proxy_str[0]) { - strncpy(proxy_url, proxy_str, sizeof(proxy_url) - 1); - proxy_url[sizeof(proxy_url) - 1] = '\0'; + strncpy(proxy_url, proxy_str, sizeof(proxy_url) - 1); + proxy_url[sizeof(proxy_url) - 1] = '\0'; } randomize_username = rand_user; randomize_password = rand_pass; @@ -40,49 +40,49 @@ static Proxy parse_proxy_line(const char *line) { if (len == 0) return proxy; if (strncmp(line, "http://", 7) == 0) { - proxy.type = PROXY_HTTP; - host_start = line + 7; + proxy.type = PROXY_HTTP; + host_start = line + 7; } else if (strncmp(line, "socks5://", 9) == 0) { - proxy.type = PROXY_SOCKS5; - host_start = line + 9; + proxy.type = PROXY_SOCKS5; + host_start = line + 9; } else if (strncmp(line, "socks4://", 9) == 0) { - proxy.type = PROXY_SOCKS4; - host_start = line + 9; + proxy.type = PROXY_SOCKS4; + host_start = line + 9; } else { - host_start = line; + host_start = line; } const char *at = strchr(host_start, '@'); if (at) { - char cred_buf[128]; - size_t cred_len = at - host_start; - if (cred_len >= sizeof(cred_buf)) cred_len = sizeof(cred_buf) - 1; - strncpy(cred_buf, host_start, cred_len); - cred_buf[cred_len] = '\0'; - - char *colon = strchr(cred_buf, ':'); - if (colon) { - size_t user_len = colon - cred_buf; - if (user_len >= sizeof(proxy.username)) user_len = sizeof(proxy.username) - 1; - strncpy(proxy.username, cred_buf, user_len); - proxy.username[user_len] = '\0'; - strncpy(proxy.password, colon + 1, sizeof(proxy.password) - 1); - proxy.password[sizeof(proxy.password) - 1] = '\0'; - } - host_start = at + 1; + char cred_buf[128]; + size_t cred_len = at - host_start; + if (cred_len >= sizeof(cred_buf)) cred_len = sizeof(cred_buf) - 1; + strncpy(cred_buf, host_start, cred_len); + cred_buf[cred_len] = '\0'; + + char *colon = strchr(cred_buf, ':'); + if (colon) { + size_t user_len = colon - cred_buf; + if (user_len >= sizeof(proxy.username)) user_len = sizeof(proxy.username) - 1; + strncpy(proxy.username, cred_buf, user_len); + proxy.username[user_len] = '\0'; + strncpy(proxy.password, colon + 1, sizeof(proxy.password) - 1); + proxy.password[sizeof(proxy.password) - 1] = '\0'; + } + host_start = at + 1; } port_start = strchr(host_start, ':'); if (port_start) { - char host_buf[256]; - size_t host_len = port_start - host_start; - if (host_len >= sizeof(host_buf)) host_len = sizeof(host_buf) - 1; - strncpy(host_buf, host_start, host_len); - host_buf[host_len] = '\0'; - snprintf(proxy.host, sizeof(proxy.host), "%.*s", (int)host_len, host_buf); - proxy.port = atoi(port_start + 1); + char host_buf[256]; + size_t host_len = port_start - host_start; + if (host_len >= sizeof(host_buf)) host_len = sizeof(host_buf) - 1; + strncpy(host_buf, host_start, host_len); + host_buf[host_len] = '\0'; + snprintf(proxy.host, sizeof(proxy.host), "%.*s", (int)host_len, host_buf); + proxy.port = atoi(port_start + 1); } else { - snprintf(proxy.host, sizeof(proxy.host), "%s", host_start); + snprintf(proxy.host, sizeof(proxy.host), "%s", host_start); } return proxy; @@ -90,71 +90,71 @@ static Proxy parse_proxy_line(const char *line) { int load_proxy_list(const char *filename) { if (!filename || filename[0] == '\0') { - return 0; + return 0; } pthread_mutex_lock(&proxy_mutex); if (proxy_list) { - free(proxy_list); - proxy_list = NULL; + free(proxy_list); + proxy_list = NULL; } proxy_count = 0; FILE *file = fopen(filename, "r"); if (!file) { - pthread_mutex_unlock(&proxy_mutex); - fprintf(stderr, "[WARN] Could not open proxy list file: %s\n", filename); - return -1; + pthread_mutex_unlock(&proxy_mutex); + fprintf(stderr, "[WARN] Could not open proxy list file: %s\n", filename); + return -1; } int capacity = 16; proxy_list = (Proxy *)malloc(capacity * sizeof(Proxy)); if (!proxy_list) { - fclose(file); - return -1; + fclose(file); + return -1; } proxy_count = 0; char line[512]; while (fgets(line, sizeof(line), file)) { - line[strcspn(line, "\r\n")] = 0; + line[strcspn(line, "\r\n")] = 0; - if (line[0] == '\0' || line[0] == '#') { - continue; - } + if (line[0] == '\0' || line[0] == '#') { + continue; + } - char *p = line; - while (*p == ' ' || *p == '\t') p++; + char *p = line; + while (*p == ' ' || *p == '\t') p++; - char *end = p + strlen(p) - 1; - while (end > p && (*end == ' ' || *end == '\t')) { - *end = '\0'; - end--; - } + char *end = p + strlen(p) - 1; + while (end > p && (*end == ' ' || *end == '\t')) { + *end = '\0'; + end--; + } - if (p[0] == '\0') continue; + if (p[0] == '\0') continue; - Proxy proxy = parse_proxy_line(p); - if (proxy.port == 0) { - continue; - } + Proxy proxy = parse_proxy_line(p); + if (proxy.port == 0) { + continue; + } - if (proxy_count >= capacity) { - capacity *= 2; - Proxy *new_list = (Proxy *)realloc(proxy_list, capacity * sizeof(Proxy)); - if (!new_list) { - free(proxy_list); - proxy_list = NULL; - proxy_count = 0; - fclose(file); - pthread_mutex_unlock(&proxy_mutex); - return -1; - } - proxy_list = new_list; + if (proxy_count >= capacity) { + capacity *= 2; + Proxy *new_list = (Proxy *)realloc(proxy_list, capacity * sizeof(Proxy)); + if (!new_list) { + free(proxy_list); + proxy_list = NULL; + proxy_count = 0; + fclose(file); + pthread_mutex_unlock(&proxy_mutex); + return -1; } + proxy_list = new_list; + } - proxy_list[proxy_count++] = proxy; + proxy_list[proxy_count++] = proxy; } fclose(file); @@ -166,8 +166,8 @@ int load_proxy_list(const char *filename) { void free_proxy_list(void) { pthread_mutex_lock(&proxy_mutex); if (proxy_list) { - free(proxy_list); - proxy_list = NULL; + free(proxy_list); + proxy_list = NULL; } proxy_count = 0; pthread_mutex_unlock(&proxy_mutex); @@ -176,8 +176,8 @@ void free_proxy_list(void) { Proxy *get_random_proxy(void) { pthread_mutex_lock(&proxy_mutex); if (proxy_count == 0) { - pthread_mutex_unlock(&proxy_mutex); - return NULL; + pthread_mutex_unlock(&proxy_mutex); + return NULL; } int start = rand() % proxy_count; @@ -185,19 +185,19 @@ Proxy *get_random_proxy(void) { Proxy *selected = NULL; while (checked < proxy_count) { - int idx = (start + checked) % proxy_count; - if (proxy_list[idx].failures < max_proxy_retries) { - selected = &proxy_list[idx]; - break; - } - checked++; + int idx = (start + checked) % proxy_count; + if (proxy_list[idx].failures < max_proxy_retries) { + selected = &proxy_list[idx]; + break; + } + checked++; } if (!selected) { - for (int i = 0; i < proxy_count; i++) { - proxy_list[i].failures = 0; - } - selected = &proxy_list[rand() % proxy_count]; + for (int i = 0; i < proxy_count; i++) { + proxy_list[i].failures = 0; + } + selected = &proxy_list[rand() % proxy_count]; } pthread_mutex_unlock(&proxy_mutex); @@ -213,45 +213,45 @@ void record_proxy_failure(Proxy *proxy) { void apply_proxy_settings(CURL *curl) { if (proxy_url[0] != '\0') { - curl_easy_setopt(curl, CURLOPT_PROXY, proxy_url); - if (strncmp(proxy_url, "socks5://", 9) == 0) { - curl_easy_setopt(curl, CURLOPT_PROXYTYPE, CURLPROXY_SOCKS5); - } else if (strncmp(proxy_url, "socks4://", 9) == 0) { - curl_easy_setopt(curl, CURLOPT_PROXYTYPE, CURLPROXY_SOCKS4A); - } else { - curl_easy_setopt(curl, CURLOPT_PROXYTYPE, CURLPROXY_HTTP); - } + curl_easy_setopt(curl, CURLOPT_PROXY, proxy_url); + if (strncmp(proxy_url, "socks5://", 9) == 0) { + curl_easy_setopt(curl, CURLOPT_PROXYTYPE, CURLPROXY_SOCKS5); + } else if (strncmp(proxy_url, "socks4://", 9) == 0) { + curl_easy_setopt(curl, CURLOPT_PROXYTYPE, CURLPROXY_SOCKS4A); + } else { + curl_easy_setopt(curl, CURLOPT_PROXYTYPE, CURLPROXY_HTTP); + } - if (randomize_username || randomize_password) { - char userpwd[256]; - char username[32] = {0}; - char password[32] = {0}; + if (randomize_username || randomize_password) { + char userpwd[256]; + char username[32] = {0}; + char password[32] = {0}; - if (randomize_username) generate_random_string(username, sizeof(username)); - if (randomize_password) generate_random_string(password, sizeof(password)); + if (randomize_username) generate_random_string(username, sizeof(username)); + if (randomize_password) generate_random_string(password, sizeof(password)); - snprintf(userpwd, sizeof(userpwd), "%s:%s", username, password); - curl_easy_setopt(curl, CURLOPT_PROXYUSERPWD, userpwd); - } + snprintf(userpwd, sizeof(userpwd), "%s:%s", username, password); + curl_easy_setopt(curl, CURLOPT_PROXYUSERPWD, userpwd); + } } else if (proxy_count > 0) { - Proxy *proxy = get_random_proxy(); - if (proxy) { - char proxy_url_buf[512]; - snprintf(proxy_url_buf, sizeof(proxy_url_buf), "%s:%d", proxy->host, proxy->port); - curl_easy_setopt(curl, CURLOPT_PROXY, proxy_url_buf); - if (proxy->type == PROXY_HTTP) { - curl_easy_setopt(curl, CURLOPT_PROXYTYPE, CURLPROXY_HTTP); - } else if (proxy->type == PROXY_SOCKS4) { - curl_easy_setopt(curl, CURLOPT_PROXYTYPE, CURLPROXY_SOCKS4A); - } else { - curl_easy_setopt(curl, CURLOPT_PROXYTYPE, CURLPROXY_SOCKS5); - } - - if (proxy->username[0] != '\0' || proxy->password[0] != '\0') { - char userpwd[128]; - snprintf(userpwd, sizeof(userpwd), "%s:%s", proxy->username, proxy->password); - curl_easy_setopt(curl, CURLOPT_PROXYUSERPWD, userpwd); - } + Proxy *proxy = get_random_proxy(); + if (proxy) { + char proxy_url_buf[512]; + snprintf(proxy_url_buf, sizeof(proxy_url_buf), "%s:%d", proxy->host, proxy->port); + curl_easy_setopt(curl, CURLOPT_PROXY, proxy_url_buf); + if (proxy->type == PROXY_HTTP) { + curl_easy_setopt(curl, CURLOPT_PROXYTYPE, CURLPROXY_HTTP); + } else if (proxy->type == PROXY_SOCKS4) { + curl_easy_setopt(curl, CURLOPT_PROXYTYPE, CURLPROXY_SOCKS4A); + } else { + curl_easy_setopt(curl, CURLOPT_PROXYTYPE, CURLPROXY_SOCKS5); } + + if (proxy->username[0] != '\0' || proxy->password[0] != '\0') { + char userpwd[128]; + snprintf(userpwd, sizeof(userpwd), "%s:%s", proxy->username, proxy->password); + curl_easy_setopt(curl, CURLOPT_PROXYUSERPWD, userpwd); + } + } } } diff --git a/src/Routes/Home.c b/src/Routes/Home.c index 81370ba..4526a9d 100644 --- a/src/Routes/Home.c +++ b/src/Routes/Home.c @@ -2,13 +2,13 @@ #include <stdlib.h> int home_handler(UrlParams *params) { - (void)params; - TemplateContext ctx = new_context(); - char *rendered_html = render_template("home.html", &ctx); - send_response(rendered_html); + (void)params; + TemplateContext ctx = new_context(); + char *rendered_html = render_template("home.html", &ctx); + send_response(rendered_html); - free(rendered_html); - free_context(&ctx); + free(rendered_html); + free_context(&ctx); - return 0; + return 0; } diff --git a/src/Routes/ImageProxy.c b/src/Routes/ImageProxy.c index 5141cd5..10fbd19 100644 --- a/src/Routes/ImageProxy.c +++ b/src/Routes/ImageProxy.c @@ -16,9 +16,9 @@ typedef struct { static int is_allowed_domain(const char *url) { const char *protocol = strstr(url, "://"); if (!protocol) { - protocol = url; + protocol = url; } else { - protocol += 3; + protocol += 3; } const char *path = strchr(protocol, '/'); @@ -26,49 +26,49 @@ static int is_allowed_domain(const char *url) { char host[256] = {0}; if (host_len >= sizeof(host)) { - host_len = sizeof(host) - 1; + host_len = sizeof(host) - 1; } strncpy(host, protocol, host_len); const char *allowed_domains[] = { - "mm.bing.net", - "th.bing.com", - NULL + "mm.bing.net", + "th.bing.com", + NULL }; for (int i = 0; allowed_domains[i] != NULL; i++) { - size_t domain_len = strlen(allowed_domains[i]); - size_t host_str_len = strlen(host); - - if (host_str_len >= domain_len) { - const char *suffix = host + host_str_len - domain_len; - if (strcmp(suffix, allowed_domains[i]) == 0) { - return 1; - } + size_t domain_len = strlen(allowed_domains[i]); + size_t host_str_len = strlen(host); + + if (host_str_len >= domain_len) { + const char *suffix = host + host_str_len - domain_len; + if (strcmp(suffix, allowed_domains[i]) == 0) { + return 1; } } + } return 0; } static size_t write_callback(void *contents, size_t size, size_t nmemb, - void *userp) { + void *userp) { size_t realsize = size * nmemb; MemoryBuffer *buf = (MemoryBuffer *)userp; if (buf->size + realsize > MAX_IMAGE_SIZE) { - return 0; + return 0; } if (buf->size + realsize > buf->capacity) { - size_t new_capacity = buf->capacity * 2; - if (new_capacity < buf->size + realsize) { - new_capacity = buf->size + realsize; - } - char *new_data = realloc(buf->data, new_capacity); - if (!new_data) return 0; - buf->data = new_data; - buf->capacity = new_capacity; + size_t new_capacity = buf->capacity * 2; + if (new_capacity < buf->size + realsize) { + new_capacity = buf->size + realsize; + } + char *new_data = realloc(buf->data, new_capacity); + if (!new_data) return 0; + buf->data = new_data; + buf->capacity = new_capacity; } memcpy(buf->data + buf->size, contents, realsize); @@ -79,38 +79,38 @@ static size_t write_callback(void *contents, size_t size, size_t nmemb, int image_proxy_handler(UrlParams *params) { const char *url = NULL; for (int i = 0; i < params->count; i++) { - if (strcmp(params->params[i].key, "url") == 0) { - url = params->params[i].value; - break; - } + if (strcmp(params->params[i].key, "url") == 0) { + url = params->params[i].value; + break; + } } if (!url || strlen(url) == 0) { - send_response("Missing 'url' parameter"); - return 0; + send_response("Missing 'url' parameter"); + return 0; } if (!is_allowed_domain(url)) { - send_response("Domain not allowed"); - return 0; + send_response("Domain not allowed"); + return 0; } CURL *curl = curl_easy_init(); if (!curl) { - send_response("Failed to initialize curl"); - return 0; + send_response("Failed to initialize curl"); + return 0; } MemoryBuffer buf = { - .data = malloc(8192), - .size = 0, - .capacity = 8192 + .data = malloc(8192), + .size = 0, + .capacity = 8192 }; if (!buf.data) { - curl_easy_cleanup(curl); - send_response("Memory allocation failed"); - return 0; + curl_easy_cleanup(curl); + send_response("Memory allocation failed"); + return 0; } curl_easy_setopt(curl, CURLOPT_URL, url); @@ -130,15 +130,15 @@ int image_proxy_handler(UrlParams *params) { char content_type[64] = {0}; if (content_type_ptr) { - strncpy(content_type, content_type_ptr, sizeof(content_type) - 1); + strncpy(content_type, content_type_ptr, sizeof(content_type) - 1); } curl_easy_cleanup(curl); if (res != CURLE_OK || response_code != 200) { - free(buf.data); - send_response("Failed to fetch image"); - return 0; + free(buf.data); + send_response("Failed to fetch image"); + return 0; } const char *mime_type = strlen(content_type) > 0 ? content_type : "image/jpeg"; diff --git a/src/Routes/Images.c b/src/Routes/Images.c index e96d6fd..b997112 100644 --- a/src/Routes/Images.c +++ b/src/Routes/Images.c @@ -17,12 +17,12 @@ struct MemoryBlock { }; static size_t ImageWriteCallback(void *data, size_t size, size_t nmemb, - void *userp) { + void *userp) { size_t realsize = size * nmemb; struct MemoryBlock *mem = (struct MemoryBlock *)userp; char *ptr = (char *)realloc(mem->response, mem->size + realsize + 1); if (ptr == NULL) { - return 0; + return 0; } mem->response = ptr; memcpy(&(mem->response[mem->size]), data, realsize); @@ -35,30 +35,30 @@ static char *fetch_images_html(const char *url) { CURL *curl_handle; struct MemoryBlock chunk = {.response = malloc(1), .size = 0}; if (!chunk.response) { - return NULL; + return NULL; } curl_handle = curl_easy_init(); if (!curl_handle) { - free(chunk.response); - return NULL; + free(chunk.response); + return NULL; } curl_easy_setopt(curl_handle, CURLOPT_URL, url); curl_easy_setopt(curl_handle, CURLOPT_WRITEFUNCTION, ImageWriteCallback); curl_easy_setopt(curl_handle, CURLOPT_WRITEDATA, (void *)&chunk); curl_easy_setopt( - curl_handle, CURLOPT_USERAGENT, - "Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko"); + curl_handle, CURLOPT_USERAGENT, + "Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko"); curl_easy_setopt(curl_handle, CURLOPT_FOLLOWLOCATION, 1L); curl_easy_setopt(curl_handle, CURLOPT_TIMEOUT, 10L); apply_proxy_settings(curl_handle); CURLcode res = curl_easy_perform(curl_handle); if (res != CURLE_OK) { - free(chunk.response); - curl_easy_cleanup(curl_handle); - return NULL; + free(chunk.response); + curl_easy_cleanup(curl_handle); + return NULL; } curl_easy_cleanup(curl_handle); @@ -71,15 +71,15 @@ int images_handler(UrlParams *params) { int page = 1; if (params) { - for (int i = 0; i < params->count; i++) { - if (strcmp(params->params[i].key, "q") == 0) { - raw_query = params->params[i].value; - } else if (strcmp(params->params[i].key, "p") == 0) { - int parsed = atoi(params->params[i].value); - if (parsed > 1) page = parsed; - } + for (int i = 0; i < params->count; i++) { + if (strcmp(params->params[i].key, "q") == 0) { + raw_query = params->params[i].value; + } else if (strcmp(params->params[i].key, "p") == 0) { + int parsed = atoi(params->params[i].value); + if (parsed > 1) page = parsed; } } + } context_set(&ctx, "query", raw_query); @@ -87,7 +87,7 @@ int images_handler(UrlParams *params) { snprintf(page_str, sizeof(page_str), "%d", page); snprintf(prev_str, sizeof(prev_str), "%d", page > 1 ? page - 1 : 0); snprintf(next_str, sizeof(next_str), "%d", page + 1); - context_set(&ctx, "page", page_str); + context_set(&ctx, "page", page_str); context_set(&ctx, "prev_page", prev_str); context_set(&ctx, "next_page", next_str); @@ -95,198 +95,198 @@ int images_handler(UrlParams *params) { context_set(&ctx, "query", display_query); if (!raw_query || strlen(raw_query) == 0) { - send_response("<h1>No query provided</h1>"); - if (display_query) free(display_query); - free_context(&ctx); - return -1; + send_response("<h1>No query provided</h1>"); + if (display_query) free(display_query); + free_context(&ctx); + return -1; } CURL *tmp = curl_easy_init(); if (!tmp) { - send_response("<h1>Error initializing curl</h1>"); - if (display_query) free(display_query); - free_context(&ctx); - return -1; + send_response("<h1>Error initializing curl</h1>"); + if (display_query) free(display_query); + free_context(&ctx); + return -1; } char *encoded_query = curl_easy_escape(tmp, raw_query, 0); curl_easy_cleanup(tmp); if (!encoded_query) { - send_response("<h1>Error encoding query</h1>"); - if (display_query) free(display_query); - free_context(&ctx); - return -1; + send_response("<h1>Error encoding query</h1>"); + if (display_query) free(display_query); + free_context(&ctx); + return -1; } char url[1024]; int first = (page - 1) * 32 + 1; snprintf(url, sizeof(url), - "https://www.bing.com/images/search?q=%s&first=%d", encoded_query, first); + "https://www.bing.com/images/search?q=%s&first=%d", encoded_query, first); char *html = fetch_images_html(url); if (!html) { - send_response("<h1>Error fetching images</h1>"); - free(encoded_query); - free(display_query); - free_context(&ctx); - return -1; + send_response("<h1>Error fetching images</h1>"); + free(encoded_query); + free(display_query); + free_context(&ctx); + return -1; } htmlDocPtr doc = htmlReadMemory(html, (int)strlen(html), NULL, NULL, - HTML_PARSE_RECOVER | HTML_PARSE_NOERROR); + HTML_PARSE_RECOVER | HTML_PARSE_NOERROR); if (!doc) { - free(html); - free(encoded_query); - free(display_query); - free_context(&ctx); - return -1; + free(html); + free(encoded_query); + free(display_query); + free_context(&ctx); + return -1; } xmlXPathContextPtr xpathCtx = xmlXPathNewContext(doc); if (!xpathCtx) { - xmlFreeDoc(doc); - free(html); - free(encoded_query); - free(display_query); - free_context(&ctx); - return -1; + xmlFreeDoc(doc); + free(html); + free(encoded_query); + free(display_query); + free_context(&ctx); + return -1; } xmlXPathObjectPtr xpathObj = - xmlXPathEvalExpression((const xmlChar *)"//div[@class='item']", xpathCtx); + xmlXPathEvalExpression((const xmlChar *)"//div[@class='item']", xpathCtx); int image_count = 0; char ***image_matrix = NULL; int *inner_counts = NULL; if (xpathObj && xpathObj->nodesetval) { - int nodes = xpathObj->nodesetval->nodeNr; - - int max_images = (nodes < 32) ? nodes : 32; - image_matrix = malloc(sizeof(char **) * max_images); - inner_counts = malloc(sizeof(int) * max_images); - - for (int i = 0; i < nodes; i++) { - if (image_count >= 32) break; - - xmlNodePtr node = xpathObj->nodesetval->nodeTab[i]; - xmlNodePtr img_node = NULL; - xmlNodePtr tit_node = NULL; - xmlNodePtr des_node = NULL; - xmlNodePtr thumb_link = NULL; - - for (xmlNodePtr child = node->children; child; child = child->next) { - if (child->type != XML_ELEMENT_NODE) continue; - - if (xmlStrcmp(child->name, (const xmlChar *)"a") == 0) { - xmlChar *class = xmlGetProp(child, (const xmlChar *)"class"); - if (class) { - if (xmlStrstr(class, (const xmlChar *)"thumb") != NULL) { - thumb_link = child; - for (xmlNodePtr thumb_child = child->children; thumb_child; thumb_child = thumb_child->next) { - if (xmlStrcmp(thumb_child->name, (const xmlChar *)"div") == 0) { - xmlChar *div_class = xmlGetProp(thumb_child, (const xmlChar *)"class"); - if (div_class && xmlStrcmp(div_class, (const xmlChar *)"cico") == 0) { - for (xmlNodePtr cico_child = thumb_child->children; cico_child; cico_child = cico_child->next) { - if (xmlStrcmp(cico_child->name, (const xmlChar *)"img") == 0) { - img_node = cico_child; - break; - } - } - } - if (div_class) xmlFree(div_class); - } - } - } else if (xmlStrstr(class, (const xmlChar *)"tit") != NULL) { - tit_node = child; + int nodes = xpathObj->nodesetval->nodeNr; + + int max_images = (nodes < 32) ? nodes : 32; + image_matrix = malloc(sizeof(char **) * max_images); + inner_counts = malloc(sizeof(int) * max_images); + + for (int i = 0; i < nodes; i++) { + if (image_count >= 32) break; + + xmlNodePtr node = xpathObj->nodesetval->nodeTab[i]; + xmlNodePtr img_node = NULL; + xmlNodePtr tit_node = NULL; + xmlNodePtr des_node = NULL; + xmlNodePtr thumb_link = NULL; + + for (xmlNodePtr child = node->children; child; child = child->next) { + if (child->type != XML_ELEMENT_NODE) continue; + + if (xmlStrcmp(child->name, (const xmlChar *)"a") == 0) { + xmlChar *class = xmlGetProp(child, (const xmlChar *)"class"); + if (class) { + if (xmlStrstr(class, (const xmlChar *)"thumb") != NULL) { + thumb_link = child; + for (xmlNodePtr thumb_child = child->children; thumb_child; thumb_child = thumb_child->next) { + if (xmlStrcmp(thumb_child->name, (const xmlChar *)"div") == 0) { + xmlChar *div_class = xmlGetProp(thumb_child, (const xmlChar *)"class"); + if (div_class && xmlStrcmp(div_class, (const xmlChar *)"cico") == 0) { + for (xmlNodePtr cico_child = thumb_child->children; cico_child; cico_child = cico_child->next) { + if (xmlStrcmp(cico_child->name, (const xmlChar *)"img") == 0) { + img_node = cico_child; + break; } - xmlFree(class); } - } else if (xmlStrcmp(child->name, (const xmlChar *)"div") == 0) { - xmlChar *class = xmlGetProp(child, (const xmlChar *)"class"); - if (class && xmlStrcmp(class, (const xmlChar *)"meta") == 0) { - for (xmlNodePtr meta_child = child->children; meta_child; meta_child = meta_child->next) { - if (xmlStrcmp(meta_child->name, (const xmlChar *)"div") == 0) { - xmlChar *div_class = xmlGetProp(meta_child, (const xmlChar *)"class"); - if (div_class) { - if (xmlStrcmp(div_class, (const xmlChar *)"des") == 0) { - des_node = meta_child; - } - xmlFree(div_class); - } - } else if (xmlStrcmp(meta_child->name, (const xmlChar *)"a") == 0) { - xmlChar *a_class = xmlGetProp(meta_child, (const xmlChar *)"class"); - if (a_class && xmlStrstr(a_class, (const xmlChar *)"tit") != NULL) { - tit_node = meta_child; - } - if (a_class) xmlFree(a_class); - } - } } - if (class) xmlFree(class); + if (div_class) xmlFree(div_class); } + } + } else if (xmlStrstr(class, (const xmlChar *)"tit") != NULL) { + tit_node = child; } - - xmlChar *iurl = img_node ? xmlGetProp(img_node, (const xmlChar *)"src") : NULL; - xmlChar *full_url = thumb_link ? xmlGetProp(thumb_link, (const xmlChar *)"href") : NULL; - xmlChar *title = des_node ? xmlNodeGetContent(des_node) : (tit_node ? xmlNodeGetContent(tit_node) : NULL); - xmlChar *rurl = tit_node ? xmlGetProp(tit_node, (const xmlChar *)"href") : NULL; - - if (iurl && strlen((char *)iurl) > 0) { - char *proxy_url = NULL; - CURL *esc_curl = curl_easy_init(); - if (esc_curl) { - char *encoded = curl_easy_escape(esc_curl, (char *)iurl, 0); - if (encoded) { - size_t proxy_len = strlen("/proxy?url=") + strlen(encoded) + 1; - proxy_url = malloc(proxy_len); - if (proxy_url) { - snprintf(proxy_url, proxy_len, "/proxy?url=%s", encoded); - } - curl_free(encoded); + xmlFree(class); + } + } else if (xmlStrcmp(child->name, (const xmlChar *)"div") == 0) { + xmlChar *class = xmlGetProp(child, (const xmlChar *)"class"); + if (class && xmlStrcmp(class, (const xmlChar *)"meta") == 0) { + for (xmlNodePtr meta_child = child->children; meta_child; meta_child = meta_child->next) { + if (xmlStrcmp(meta_child->name, (const xmlChar *)"div") == 0) { + xmlChar *div_class = xmlGetProp(meta_child, (const xmlChar *)"class"); + if (div_class) { + if (xmlStrcmp(div_class, (const xmlChar *)"des") == 0) { + des_node = meta_child; } - curl_easy_cleanup(esc_curl); + xmlFree(div_class); } + } else if (xmlStrcmp(meta_child->name, (const xmlChar *)"a") == 0) { + xmlChar *a_class = xmlGetProp(meta_child, (const xmlChar *)"class"); + if (a_class && xmlStrstr(a_class, (const xmlChar *)"tit") != NULL) { + tit_node = meta_child; + } + if (a_class) xmlFree(a_class); + } + } + } + if (class) xmlFree(class); + } + } - image_matrix[image_count] = malloc(sizeof(char *) * 4); - image_matrix[image_count][0] = proxy_url ? strdup(proxy_url) : strdup((char *)iurl); - image_matrix[image_count][1] = strdup(title ? (char *)title : "Image"); - image_matrix[image_count][2] = strdup(rurl ? (char *)rurl : "#"); - image_matrix[image_count][3] = strdup(full_url ? (char *)full_url : "#"); - inner_counts[image_count] = 4; - image_count++; + xmlChar *iurl = img_node ? xmlGetProp(img_node, (const xmlChar *)"src") : NULL; + xmlChar *full_url = thumb_link ? xmlGetProp(thumb_link, (const xmlChar *)"href") : NULL; + xmlChar *title = des_node ? xmlNodeGetContent(des_node) : (tit_node ? xmlNodeGetContent(tit_node) : NULL); + xmlChar *rurl = tit_node ? xmlGetProp(tit_node, (const xmlChar *)"href") : NULL; + + if (iurl && strlen((char *)iurl) > 0) { + char *proxy_url = NULL; + CURL *esc_curl = curl_easy_init(); + if (esc_curl) { + char *encoded = curl_easy_escape(esc_curl, (char *)iurl, 0); + if (encoded) { + size_t proxy_len = strlen("/proxy?url=") + strlen(encoded) + 1; + proxy_url = malloc(proxy_len); + if (proxy_url) { + snprintf(proxy_url, proxy_len, "/proxy?url=%s", encoded); } + curl_free(encoded); + } + curl_easy_cleanup(esc_curl); + } - if (iurl) xmlFree(iurl); - if (title) xmlFree(title); - if (rurl) xmlFree(rurl); - if (full_url) xmlFree(full_url); + image_matrix[image_count] = malloc(sizeof(char *) * 4); + image_matrix[image_count][0] = proxy_url ? strdup(proxy_url) : strdup((char *)iurl); + image_matrix[image_count][1] = strdup(title ? (char *)title : "Image"); + image_matrix[image_count][2] = strdup(rurl ? (char *)rurl : "#"); + image_matrix[image_count][3] = strdup(full_url ? (char *)full_url : "#"); + inner_counts[image_count] = 4; + image_count++; } + + if (iurl) xmlFree(iurl); + if (title) xmlFree(title); + if (rurl) xmlFree(rurl); + if (full_url) xmlFree(full_url); + } } context_set_array_of_arrays(&ctx, "images", image_matrix, image_count, - inner_counts); + inner_counts); char *rendered = render_template("images.html", &ctx); if (rendered) { - send_response(rendered); - free(rendered); + send_response(rendered); + free(rendered); } else { - send_response("<h1>Error rendering image results</h1>"); + send_response("<h1>Error rendering image results</h1>"); } if (image_matrix) { - for (int i = 0; i < image_count; i++) { - for (int j = 0; j < 4; j++) { - free(image_matrix[i][j]); - } - free(image_matrix[i]); + for (int i = 0; i < image_count; i++) { + for (int j = 0; j < 4; j++) { + free(image_matrix[i][j]); } - free(image_matrix); + free(image_matrix[i]); + } + free(image_matrix); } if (inner_counts) { - free(inner_counts); + free(inner_counts); } if (xpathObj) xmlXPathFreeObject(xpathObj); diff --git a/src/Routes/Search.c b/src/Routes/Search.c index dee7a9f..51fe415 100644 --- a/src/Routes/Search.c +++ b/src/Routes/Search.c @@ -23,13 +23,13 @@ static void *wiki_thread_func(void *arg) { InfoBoxThreadData *data = (InfoBoxThreadData *)arg; char *dynamic_url = construct_wiki_url(data->query); if (dynamic_url) { - data->result = fetch_wiki_data(dynamic_url); - data->success = - (data->result.title != NULL && data->result.extract != NULL && - strlen(data->result.extract) > 10); - free(dynamic_url); + data->result = fetch_wiki_data(dynamic_url); + data->success = + (data->result.title != NULL && data->result.extract != NULL && + strlen(data->result.extract) > 10); + free(dynamic_url); } else { - data->success = 0; + data->success = 0; } return NULL; } @@ -41,44 +41,44 @@ static int is_calculator_query(const char *query) { int has_math_operator = 0; for (const char *p = query; *p; p++) { - if (isdigit(*p) || *p == '.') { - has_digit = 1; - } - if (*p == '+' || *p == '-' || *p == '*' || *p == '/' || *p == '^') { - has_math_operator = 1; - } + if (isdigit(*p) || *p == '.') { + has_digit = 1; + } + if (*p == '+' || *p == '-' || *p == '*' || *p == '/' || *p == '^') { + has_math_operator = 1; + } } if (!has_digit || !has_math_operator) return 0; int len = strlen(query); for (int i = 0; i < len; i++) { - char c = query[i]; - if (c == '+' || c == '-' || c == '*' || c == '/' || c == '^') { - int has_num_before = 0; - int has_num_after = 0; - - for (int j = i - 1; j >= 0; j--) { - if (isdigit(query[j]) || query[j] == '.') { - has_num_before = 1; - break; - } - if (query[j] != ' ') break; - } + char c = query[i]; + if (c == '+' || c == '-' || c == '*' || c == '/' || c == '^') { + int has_num_before = 0; + int has_num_after = 0; + + for (int j = i - 1; j >= 0; j--) { + if (isdigit(query[j]) || query[j] == '.') { + has_num_before = 1; + break; + } + if (query[j] != ' ') break; + } - for (int j = i + 1; j < len; j++) { - if (isdigit(query[j]) || query[j] == '.') { - has_num_after = 1; - break; - } - if (query[j] != ' ') break; - } + for (int j = i + 1; j < len; j++) { + if (isdigit(query[j]) || query[j] == '.') { + has_num_after = 1; + break; + } + if (query[j] != ' ') break; + } - if (has_num_before || has_num_after) { - return 1; - } + if (has_num_before || has_num_after) { + return 1; } } + } return 0; } @@ -87,11 +87,11 @@ static void *calc_thread_func(void *arg) { InfoBoxThreadData *data = (InfoBoxThreadData *)arg; if (is_calculator_query(data->query)) { - data->result = fetch_calc_data((char *)data->query); - data->success = - (data->result.title != NULL && data->result.extract != NULL); + data->result = fetch_calc_data((char *)data->query); + data->success = + (data->result.title != NULL && data->result.extract != NULL); } else { - data->success = 0; + data->success = 0; } return NULL; @@ -101,11 +101,11 @@ static void *dict_thread_func(void *arg) { InfoBoxThreadData *data = (InfoBoxThreadData *)arg; if (is_dictionary_query(data->query)) { - data->result = fetch_dictionary_data(data->query); - data->success = - (data->result.title != NULL && data->result.extract != NULL); + data->result = fetch_dictionary_data(data->query); + data->success = + (data->result.title != NULL && data->result.extract != NULL); } else { - data->success = 0; + data->success = 0; } return NULL; @@ -115,22 +115,22 @@ static void *unit_thread_func(void *arg) { InfoBoxThreadData *data = (InfoBoxThreadData *)arg; if (is_unit_conv_query(data->query)) { - data->result = fetch_unit_conv_data(data->query); - data->success = - (data->result.title != NULL && data->result.extract != NULL); + data->result = fetch_unit_conv_data(data->query); + data->success = + (data->result.title != NULL && data->result.extract != NULL); } else { - data->success = 0; + data->success = 0; } return NULL; } static int add_infobox_to_collection(InfoBox *infobox, char ****collection, - int **inner_counts, int current_count) { + int **inner_counts, int current_count) { *collection = - (char ***)realloc(*collection, sizeof(char **) * (current_count + 1)); + (char ***)realloc(*collection, sizeof(char **) * (current_count + 1)); *inner_counts = - (int *)realloc(*inner_counts, sizeof(int) * (current_count + 1)); + (int *)realloc(*inner_counts, sizeof(int) * (current_count + 1)); (*collection)[current_count] = (char **)malloc(sizeof(char *) * 4); (*collection)[current_count][0] = infobox->title ? strdup(infobox->title) : NULL; @@ -148,15 +148,15 @@ int results_handler(UrlParams *params) { int page = 1; if (params) { - for (int i = 0; i < params->count; i++) { - if (strcmp(params->params[i].key, "q") == 0) { - raw_query = params->params[i].value; - } else if (strcmp(params->params[i].key, "p") == 0) { - int parsed = atoi(params->params[i].value); - if (parsed > 1) page = parsed; - } + for (int i = 0; i < params->count; i++) { + if (strcmp(params->params[i].key, "q") == 0) { + raw_query = params->params[i].value; + } else if (strcmp(params->params[i].key, "p") == 0) { + int parsed = atoi(params->params[i].value); + if (parsed > 1) page = parsed; } } + } context_set(&ctx, "query", raw_query); @@ -164,14 +164,14 @@ int results_handler(UrlParams *params) { snprintf(page_str, sizeof(page_str), "%d", page); snprintf(prev_str, sizeof(prev_str), "%d", page > 1 ? page - 1 : 0); snprintf(next_str, sizeof(next_str), "%d", page + 1); - context_set(&ctx, "page", page_str); + context_set(&ctx, "page", page_str); context_set(&ctx, "prev_page", prev_str); context_set(&ctx, "next_page", next_str); if (!raw_query || strlen(raw_query) == 0) { - send_response("<h1>No query provided</h1>"); - free_context(&ctx); - return -1; + send_response("<h1>No query provided</h1>"); + free_context(&ctx); + return -1; } pthread_t wiki_tid, calc_tid, dict_tid, unit_tid; @@ -181,36 +181,36 @@ int results_handler(UrlParams *params) { InfoBoxThreadData unit_data = {.query = raw_query, .success = 0}; if (page == 1) { - pthread_create(&wiki_tid, NULL, wiki_thread_func, &wiki_data); - pthread_create(&calc_tid, NULL, calc_thread_func, &calc_data); - pthread_create(&dict_tid, NULL, dict_thread_func, &dict_data); - pthread_create(&unit_tid, NULL, unit_thread_func, &unit_data); + pthread_create(&wiki_tid, NULL, wiki_thread_func, &wiki_data); + pthread_create(&calc_tid, NULL, calc_thread_func, &calc_data); + pthread_create(&dict_tid, NULL, dict_thread_func, &dict_data); + pthread_create(&unit_tid, NULL, unit_thread_func, &unit_data); } ScrapeJob jobs[ENGINE_COUNT]; SearchResult *all_results[ENGINE_COUNT]; for (int i = 0; i < ENGINE_COUNT; i++) { - all_results[i] = NULL; - jobs[i].engine = &ENGINE_REGISTRY[i]; - jobs[i].query = raw_query; - jobs[i].out_results = &all_results[i]; - jobs[i].max_results = 10; - jobs[i].results_count = 0; - jobs[i].page = page; - jobs[i].handle = NULL; - jobs[i].response.memory = NULL; - jobs[i].response.size = 0; - jobs[i].response.capacity = 0; + all_results[i] = NULL; + jobs[i].engine = &ENGINE_REGISTRY[i]; + jobs[i].query = raw_query; + jobs[i].out_results = &all_results[i]; + jobs[i].max_results = 10; + jobs[i].results_count = 0; + jobs[i].page = page; + jobs[i].handle = NULL; + jobs[i].response.memory = NULL; + jobs[i].response.size = 0; + jobs[i].response.capacity = 0; } scrape_engines_parallel(jobs, ENGINE_COUNT); if (page == 1) { - pthread_join(wiki_tid, NULL); - pthread_join(calc_tid, NULL); - pthread_join(dict_tid, NULL); - pthread_join(unit_tid, NULL); + pthread_join(wiki_tid, NULL); + pthread_join(calc_tid, NULL); + pthread_join(dict_tid, NULL); + pthread_join(unit_tid, NULL); } char ***infobox_matrix = NULL; @@ -218,118 +218,118 @@ int results_handler(UrlParams *params) { int infobox_count = 0; if (page == 1) { - if (dict_data.success) { - infobox_count = add_infobox_to_collection(&dict_data.result, &infobox_matrix, - &infobox_inner_counts, infobox_count); - } + if (dict_data.success) { + infobox_count = add_infobox_to_collection(&dict_data.result, &infobox_matrix, + &infobox_inner_counts, infobox_count); + } - if (calc_data.success) { - infobox_count = add_infobox_to_collection(&calc_data.result, &infobox_matrix, - &infobox_inner_counts, infobox_count); - } + if (calc_data.success) { + infobox_count = add_infobox_to_collection(&calc_data.result, &infobox_matrix, + &infobox_inner_counts, infobox_count); + } - if (unit_data.success) { - infobox_count = add_infobox_to_collection(&unit_data.result, &infobox_matrix, - &infobox_inner_counts, infobox_count); - } + if (unit_data.success) { + infobox_count = add_infobox_to_collection(&unit_data.result, &infobox_matrix, + &infobox_inner_counts, infobox_count); + } - if (wiki_data.success) { - infobox_count = add_infobox_to_collection(&wiki_data.result, &infobox_matrix, - &infobox_inner_counts, infobox_count); - } + if (wiki_data.success) { + infobox_count = add_infobox_to_collection(&wiki_data.result, &infobox_matrix, + &infobox_inner_counts, infobox_count); + } } if (infobox_count > 0) { - context_set_array_of_arrays(&ctx, "infoboxes", infobox_matrix, - infobox_count, infobox_inner_counts); - for (int i = 0; i < infobox_count; i++) { - for (int j = 0; j < 4; j++) free(infobox_matrix[i][j]); - free(infobox_matrix[i]); - } - free(infobox_matrix); - free(infobox_inner_counts); + context_set_array_of_arrays(&ctx, "infoboxes", infobox_matrix, + infobox_count, infobox_inner_counts); + for (int i = 0; i < infobox_count; i++) { + for (int j = 0; j < 4; j++) free(infobox_matrix[i][j]); + free(infobox_matrix[i]); + } + free(infobox_matrix); + free(infobox_inner_counts); } int total_results = 0; for (int i = 0; i < ENGINE_COUNT; i++) { - total_results += jobs[i].results_count; + total_results += jobs[i].results_count; } if (total_results > 0) { - char ***results_matrix = (char ***)malloc(sizeof(char **) * total_results); - int *results_inner_counts = (int *)malloc(sizeof(int) * total_results); - char **seen_urls = (char **)malloc(sizeof(char *) * total_results); - int unique_count = 0; - - for (int i = 0; i < ENGINE_COUNT; i++) { - for (int j = 0; j < jobs[i].results_count; j++) { - char *display_url = all_results[i][j].url; - - int is_duplicate = 0; - for (int k = 0; k < unique_count; k++) { - if (strcmp(seen_urls[k], display_url) == 0) { - is_duplicate = 1; - break; - } - } - - if (is_duplicate) { - free(all_results[i][j].url); - free(all_results[i][j].title); - free(all_results[i][j].snippet); - continue; - } - - seen_urls[unique_count] = strdup(display_url); - results_matrix[unique_count] = (char **)malloc(sizeof(char *) * 4); - char *pretty_url = pretty_display_url(display_url); - - results_matrix[unique_count][0] = strdup(display_url); - results_matrix[unique_count][1] = strdup(pretty_url); - results_matrix[unique_count][2] = all_results[i][j].title ? strdup(all_results[i][j].title) : strdup("Untitled"); - results_matrix[unique_count][3] = all_results[i][j].snippet ? strdup(all_results[i][j].snippet) : strdup(""); - - results_inner_counts[unique_count] = 4; - - free(pretty_url); - free(all_results[i][j].url); - free(all_results[i][j].title); - free(all_results[i][j].snippet); - - unique_count++; + char ***results_matrix = (char ***)malloc(sizeof(char **) * total_results); + int *results_inner_counts = (int *)malloc(sizeof(int) * total_results); + char **seen_urls = (char **)malloc(sizeof(char *) * total_results); + int unique_count = 0; + + for (int i = 0; i < ENGINE_COUNT; i++) { + for (int j = 0; j < jobs[i].results_count; j++) { + char *display_url = all_results[i][j].url; + + int is_duplicate = 0; + for (int k = 0; k < unique_count; k++) { + if (strcmp(seen_urls[k], display_url) == 0) { + is_duplicate = 1; + break; } - free(all_results[i]); } - context_set_array_of_arrays(&ctx, "results", results_matrix, unique_count, results_inner_counts); - - char *html = render_template("results.html", &ctx); - if (html) { - send_response(html); - free(html); + if (is_duplicate) { + free(all_results[i][j].url); + free(all_results[i][j].title); + free(all_results[i][j].snippet); + continue; } - for (int i = 0; i < unique_count; i++) { - for (int j = 0; j < 4; j++) free(results_matrix[i][j]); - free(results_matrix[i]); - free(seen_urls[i]); + seen_urls[unique_count] = strdup(display_url); + results_matrix[unique_count] = (char **)malloc(sizeof(char *) * 4); + char *pretty_url = pretty_display_url(display_url); + + results_matrix[unique_count][0] = strdup(display_url); + results_matrix[unique_count][1] = strdup(pretty_url); + results_matrix[unique_count][2] = all_results[i][j].title ? strdup(all_results[i][j].title) : strdup("Untitled"); + results_matrix[unique_count][3] = all_results[i][j].snippet ? strdup(all_results[i][j].snippet) : strdup(""); + + results_inner_counts[unique_count] = 4; + + free(pretty_url); + free(all_results[i][j].url); + free(all_results[i][j].title); + free(all_results[i][j].snippet); + + unique_count++; } - free(seen_urls); - free(results_matrix); - free(results_inner_counts); + free(all_results[i]); + } + + context_set_array_of_arrays(&ctx, "results", results_matrix, unique_count, results_inner_counts); + + char *html = render_template("results.html", &ctx); + if (html) { + send_response(html); + free(html); + } + + for (int i = 0; i < unique_count; i++) { + for (int j = 0; j < 4; j++) free(results_matrix[i][j]); + free(results_matrix[i]); + free(seen_urls[i]); + } + free(seen_urls); + free(results_matrix); + free(results_inner_counts); } else { - char *html = render_template("results.html", &ctx); - if (html) { - send_response(html); - free(html); - } + char *html = render_template("results.html", &ctx); + if (html) { + send_response(html); + free(html); + } } if (page == 1) { - if (wiki_data.success) free_infobox(&wiki_data.result); - if (calc_data.success) free_infobox(&calc_data.result); - if (dict_data.success) free_infobox(&dict_data.result); - if (unit_data.success) free_infobox(&unit_data.result); + if (wiki_data.success) free_infobox(&wiki_data.result); + if (calc_data.success) free_infobox(&calc_data.result); + if (dict_data.success) free_infobox(&dict_data.result); + if (unit_data.success) free_infobox(&unit_data.result); } free_context(&ctx); diff --git a/src/Scraping/Scraping.c b/src/Scraping/Scraping.c index 5b1b5d6..97f40a8 100644 --- a/src/Scraping/Scraping.c +++ b/src/Scraping/Scraping.c @@ -11,21 +11,21 @@ #include <unistd.h> static size_t WriteMemoryCallback(void *contents, size_t size, size_t nmemb, - void *userp) { + void *userp) { size_t realsize = size * nmemb; MemoryBuffer *mem = (MemoryBuffer *)userp; if (mem->size + realsize + 1 > mem->capacity) { - size_t new_cap = mem->capacity == 0 ? 16384 : mem->capacity * 2; - while (new_cap < mem->size + realsize + 1) new_cap *= 2; + size_t new_cap = mem->capacity == 0 ? 16384 : mem->capacity * 2; + while (new_cap < mem->size + realsize + 1) new_cap *= 2; - char *ptr = (char *)realloc(mem->memory, new_cap); - if (!ptr) { - return 0; - } - mem->memory = ptr; - mem->capacity = new_cap; + char *ptr = (char *)realloc(mem->memory, new_cap); + if (!ptr) { + return 0; + } + mem->memory = ptr; + mem->capacity = new_cap; } memcpy(&(mem->memory[mem->size]), contents, realsize); @@ -37,37 +37,37 @@ static size_t WriteMemoryCallback(void *contents, size_t size, size_t nmemb, static const char *get_random_user_agent() { static const char *agents[] = { - "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, " - "like Gecko) Chrome/120.0.0.0 Safari/537.36", - "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 " - "(KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36", - "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like " - "Gecko) " - "Chrome/120.0.0.0` Safari/537.36", - "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 " - "Firefox/121.0", - "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 " - "(KHTML, like Gecko) Version/17.2 Safari/605.1.15"}; + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, " + "like Gecko) Chrome/120.0.0.0 Safari/537.36", + "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 " + "(KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36", + "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like " + "Gecko) " + "Chrome/120.0.0.0` Safari/537.36", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 " + "Firefox/121.0", + "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 " + "(KHTML, like Gecko) Version/17.2 Safari/605.1.15"}; return agents[rand() % 5]; } static int parse_ddg_lite(const char *engine_name, xmlDocPtr doc, - SearchResult **out_results, int max_results) { + SearchResult **out_results, int max_results) { (void)engine_name; int found_count = 0; xmlXPathContextPtr xpathCtx = xmlXPathNewContext(doc); if (!xpathCtx) { - return 0; + return 0; } const char *link_xpath = "//tr[not(contains(@class, 'result-sponsored'))]//a[@class='result-link']"; xmlXPathObjectPtr xpathObj = - xmlXPathEvalExpression((xmlChar *)link_xpath, xpathCtx); + xmlXPathEvalExpression((xmlChar *)link_xpath, xpathCtx); if (!xpathObj || !xpathObj->nodesetval || xpathObj->nodesetval->nodeNr == 0) { - if (xpathObj) xmlXPathFreeObject(xpathObj); - xmlXPathFreeContext(xpathCtx); - return 0; + if (xpathObj) xmlXPathFreeObject(xpathObj); + xmlXPathFreeContext(xpathCtx); + return 0; } int num_links = xpathObj->nodesetval->nodeNr; @@ -75,49 +75,49 @@ static int parse_ddg_lite(const char *engine_name, xmlDocPtr doc, int actual_alloc = (num_links < max_results) ? num_links : max_results; *out_results = (SearchResult *)calloc(actual_alloc, sizeof(SearchResult)); if (!*out_results) { - xmlXPathFreeObject(xpathObj); - xmlXPathFreeContext(xpathCtx); - return 0; + xmlXPathFreeObject(xpathObj); + xmlXPathFreeContext(xpathCtx); + return 0; } for (int i = 0; i < num_links && found_count < max_results; i++) { - xmlNodePtr linkNode = xpathObj->nodesetval->nodeTab[i]; - char *title = (char *)xmlNodeGetContent(linkNode); - char *url = (char *)xmlGetProp(linkNode, (xmlChar *)"href"); - char *snippet_text = NULL; - - xmlNodePtr current = linkNode->parent; - while (current && xmlStrcasecmp(current->name, (const xmlChar *)"tr") != 0) - current = current->parent; - - if (current && current->next) { - xmlNodePtr snippetRow = current->next; - while (snippetRow && - xmlStrcasecmp(snippetRow->name, (const xmlChar *)"tr") != 0) - snippetRow = snippetRow->next; - if (snippetRow) { - - xpathCtx->node = snippetRow; - xmlXPathObjectPtr sObj = xmlXPathEvalExpression( - (xmlChar *)".//td[@class='result-snippet']", xpathCtx); - if (sObj && sObj->nodesetval && sObj->nodesetval->nodeNr > 0) { - snippet_text = (char *)xmlNodeGetContent(sObj->nodesetval->nodeTab[0]); - } - if (sObj) xmlXPathFreeObject(sObj); - xpathCtx->node = NULL; + xmlNodePtr linkNode = xpathObj->nodesetval->nodeTab[i]; + char *title = (char *)xmlNodeGetContent(linkNode); + char *url = (char *)xmlGetProp(linkNode, (xmlChar *)"href"); + char *snippet_text = NULL; + + xmlNodePtr current = linkNode->parent; + while (current && xmlStrcasecmp(current->name, (const xmlChar *)"tr") != 0) + current = current->parent; + + if (current && current->next) { + xmlNodePtr snippetRow = current->next; + while (snippetRow && + xmlStrcasecmp(snippetRow->name, (const xmlChar *)"tr") != 0) + snippetRow = snippetRow->next; + if (snippetRow) { + + xpathCtx->node = snippetRow; + xmlXPathObjectPtr sObj = xmlXPathEvalExpression( + (xmlChar *)".//td[@class='result-snippet']", xpathCtx); + if (sObj && sObj->nodesetval && sObj->nodesetval->nodeNr > 0) { + snippet_text = (char *)xmlNodeGetContent(sObj->nodesetval->nodeTab[0]); + } + if (sObj) xmlXPathFreeObject(sObj); + xpathCtx->node = NULL; - } } + } - (*out_results)[found_count].url = unescape_search_url(url); - (*out_results)[found_count].title = strdup(title ? title : "No Title"); - (*out_results)[found_count].snippet = strdup(snippet_text ? snippet_text : ""); + (*out_results)[found_count].url = unescape_search_url(url); + (*out_results)[found_count].title = strdup(title ? title : "No Title"); + (*out_results)[found_count].snippet = strdup(snippet_text ? snippet_text : ""); - found_count++; + found_count++; - if (title) xmlFree(title); - if (url) xmlFree(url); - if (snippet_text) xmlFree(snippet_text); + if (title) xmlFree(title); + if (url) xmlFree(url); + if (snippet_text) xmlFree(snippet_text); } xmlXPathFreeObject(xpathObj); @@ -126,22 +126,22 @@ static int parse_ddg_lite(const char *engine_name, xmlDocPtr doc, } static int parse_startpage(const char *engine_name, xmlDocPtr doc, - SearchResult **out_results, int max_results) { + SearchResult **out_results, int max_results) { (void)engine_name; int found_count = 0; xmlXPathContextPtr xpathCtx = xmlXPathNewContext(doc); if (!xpathCtx) { - return 0; + return 0; } const char *container_xpath = "//div[contains(@class, 'result')]"; xmlXPathObjectPtr xpathObj = - xmlXPathEvalExpression((xmlChar *)container_xpath, xpathCtx); + xmlXPathEvalExpression((xmlChar *)container_xpath, xpathCtx); if (!xpathObj || !xpathObj->nodesetval || xpathObj->nodesetval->nodeNr == 0) { - if (xpathObj) xmlXPathFreeObject(xpathObj); - xmlXPathFreeContext(xpathCtx); - return 0; + if (xpathObj) xmlXPathFreeObject(xpathObj); + xmlXPathFreeContext(xpathCtx); + return 0; } int num_results = xpathObj->nodesetval->nodeNr; @@ -149,52 +149,52 @@ static int parse_startpage(const char *engine_name, xmlDocPtr doc, int actual_alloc = (num_results < max_results) ? num_results : max_results; *out_results = (SearchResult *)calloc(actual_alloc, sizeof(SearchResult)); if (!*out_results) { - xmlXPathFreeObject(xpathObj); - xmlXPathFreeContext(xpathCtx); - return 0; + xmlXPathFreeObject(xpathObj); + xmlXPathFreeContext(xpathCtx); + return 0; } for (int i = 0; i < num_results && found_count < max_results; i++) { - xmlNodePtr resultNode = xpathObj->nodesetval->nodeTab[i]; - xpathCtx->node = resultNode; - - xmlXPathObjectPtr linkObj = xmlXPathEvalExpression( - (xmlChar *)".//a[contains(@class, 'result-link')]", xpathCtx); - char *url = - (linkObj && linkObj->nodesetval && linkObj->nodesetval->nodeNr > 0) - ? (char *)xmlGetProp(linkObj->nodesetval->nodeTab[0], - (xmlChar *)"href") - : NULL; - - xmlXPathObjectPtr titleObj = xmlXPathEvalExpression( - (xmlChar *)".//h2[contains(@class, 'wgl-title')]", xpathCtx); - char *title = - (titleObj && titleObj->nodesetval && titleObj->nodesetval->nodeNr > 0) - ? (char *)xmlNodeGetContent(titleObj->nodesetval->nodeTab[0]) - : NULL; - - xmlXPathObjectPtr snippetObj = xmlXPathEvalExpression( - (xmlChar *)".//p[contains(@class, 'description')]", xpathCtx); - char *snippet_text = - (snippetObj && snippetObj->nodesetval && - snippetObj->nodesetval->nodeNr > 0) - ? (char *)xmlNodeGetContent(snippetObj->nodesetval->nodeTab[0]) - : NULL; - - if (url && title) { - (*out_results)[found_count].url = strdup(url); - (*out_results)[found_count].title = strdup(title); - (*out_results)[found_count].snippet = - strdup(snippet_text ? snippet_text : ""); - found_count++; - } + xmlNodePtr resultNode = xpathObj->nodesetval->nodeTab[i]; + xpathCtx->node = resultNode; + + xmlXPathObjectPtr linkObj = xmlXPathEvalExpression( + (xmlChar *)".//a[contains(@class, 'result-link')]", xpathCtx); + char *url = + (linkObj && linkObj->nodesetval && linkObj->nodesetval->nodeNr > 0) + ? (char *)xmlGetProp(linkObj->nodesetval->nodeTab[0], + (xmlChar *)"href") + : NULL; + + xmlXPathObjectPtr titleObj = xmlXPathEvalExpression( + (xmlChar *)".//h2[contains(@class, 'wgl-title')]", xpathCtx); + char *title = + (titleObj && titleObj->nodesetval && titleObj->nodesetval->nodeNr > 0) + ? (char *)xmlNodeGetContent(titleObj->nodesetval->nodeTab[0]) + : NULL; + + xmlXPathObjectPtr snippetObj = xmlXPathEvalExpression( + (xmlChar *)".//p[contains(@class, 'description')]", xpathCtx); + char *snippet_text = + (snippetObj && snippetObj->nodesetval && + snippetObj->nodesetval->nodeNr > 0) + ? (char *)xmlNodeGetContent(snippetObj->nodesetval->nodeTab[0]) + : NULL; + + if (url && title) { + (*out_results)[found_count].url = strdup(url); + (*out_results)[found_count].title = strdup(title); + (*out_results)[found_count].snippet = + strdup(snippet_text ? snippet_text : ""); + found_count++; + } - if (title) xmlFree(title); - if (url) xmlFree(url); - if (snippet_text) xmlFree(snippet_text); - if (linkObj) xmlXPathFreeObject(linkObj); - if (titleObj) xmlXPathFreeObject(titleObj); - if (snippetObj) xmlXPathFreeObject(snippetObj); + if (title) xmlFree(title); + if (url) xmlFree(url); + if (snippet_text) xmlFree(snippet_text); + if (linkObj) xmlXPathFreeObject(linkObj); + if (titleObj) xmlXPathFreeObject(titleObj); + if (snippetObj) xmlXPathFreeObject(snippetObj); } xpathCtx->node = NULL; @@ -205,22 +205,22 @@ static int parse_startpage(const char *engine_name, xmlDocPtr doc, } static int parse_yahoo(const char *engine_name, xmlDocPtr doc, - SearchResult **out_results, int max_results) { + SearchResult **out_results, int max_results) { (void)engine_name; int found_count = 0; xmlXPathContextPtr xpathCtx = xmlXPathNewContext(doc); if (!xpathCtx) { - return 0; + return 0; } const char *container_xpath = "//div[contains(@class, 'algo-sr')]"; xmlXPathObjectPtr xpathObj = - xmlXPathEvalExpression((xmlChar *)container_xpath, xpathCtx); + xmlXPathEvalExpression((xmlChar *)container_xpath, xpathCtx); if (!xpathObj || !xpathObj->nodesetval || xpathObj->nodesetval->nodeNr == 0) { - if (xpathObj) xmlXPathFreeObject(xpathObj); - xmlXPathFreeContext(xpathCtx); - return 0; + if (xpathObj) xmlXPathFreeObject(xpathObj); + xmlXPathFreeContext(xpathCtx); + return 0; } int num_results = xpathObj->nodesetval->nodeNr; @@ -228,53 +228,53 @@ static int parse_yahoo(const char *engine_name, xmlDocPtr doc, int actual_alloc = (num_results < max_results) ? num_results : max_results; *out_results = (SearchResult *)calloc(actual_alloc, sizeof(SearchResult)); if (!*out_results) { - xmlXPathFreeObject(xpathObj); - xmlXPathFreeContext(xpathCtx); - return 0; + xmlXPathFreeObject(xpathObj); + xmlXPathFreeContext(xpathCtx); + return 0; } for (int i = 0; i < num_results && found_count < max_results; i++) { - xmlNodePtr resultNode = xpathObj->nodesetval->nodeTab[i]; - xpathCtx->node = resultNode; - - xmlXPathObjectPtr linkObj = xmlXPathEvalExpression( - (xmlChar *)".//div[contains(@class, 'compTitle')]//a[@target='_blank']", - xpathCtx); - char *url = - (linkObj && linkObj->nodesetval && linkObj->nodesetval->nodeNr > 0) - ? (char *)xmlGetProp(linkObj->nodesetval->nodeTab[0], - (xmlChar *)"href") - : NULL; - - xmlXPathObjectPtr titleObj = xmlXPathEvalExpression( - (xmlChar *)".//h3[contains(@class, 'title')]", xpathCtx); - char *title = - (titleObj && titleObj->nodesetval && titleObj->nodesetval->nodeNr > 0) - ? (char *)xmlNodeGetContent(titleObj->nodesetval->nodeTab[0]) - : NULL; - - xmlXPathObjectPtr snippetObj = xmlXPathEvalExpression( - (xmlChar *)".//div[contains(@class, 'compText')]//p", xpathCtx); - char *snippet_text = - (snippetObj && snippetObj->nodesetval && - snippetObj->nodesetval->nodeNr > 0) - ? (char *)xmlNodeGetContent(snippetObj->nodesetval->nodeTab[0]) - : NULL; - - if (url && title) { - (*out_results)[found_count].url = unescape_search_url(url); - (*out_results)[found_count].title = strdup(title); - (*out_results)[found_count].snippet = - strdup(snippet_text ? snippet_text : ""); - found_count++; - } + xmlNodePtr resultNode = xpathObj->nodesetval->nodeTab[i]; + xpathCtx->node = resultNode; + + xmlXPathObjectPtr linkObj = xmlXPathEvalExpression( + (xmlChar *)".//div[contains(@class, 'compTitle')]//a[@target='_blank']", + xpathCtx); + char *url = + (linkObj && linkObj->nodesetval && linkObj->nodesetval->nodeNr > 0) + ? (char *)xmlGetProp(linkObj->nodesetval->nodeTab[0], + (xmlChar *)"href") + : NULL; + + xmlXPathObjectPtr titleObj = xmlXPathEvalExpression( + (xmlChar *)".//h3[contains(@class, 'title')]", xpathCtx); + char *title = + (titleObj && titleObj->nodesetval && titleObj->nodesetval->nodeNr > 0) + ? (char *)xmlNodeGetContent(titleObj->nodesetval->nodeTab[0]) + : NULL; + + xmlXPathObjectPtr snippetObj = xmlXPathEvalExpression( + (xmlChar *)".//div[contains(@class, 'compText')]//p", xpathCtx); + char *snippet_text = + (snippetObj && snippetObj->nodesetval && + snippetObj->nodesetval->nodeNr > 0) + ? (char *)xmlNodeGetContent(snippetObj->nodesetval->nodeTab[0]) + : NULL; + + if (url && title) { + (*out_results)[found_count].url = unescape_search_url(url); + (*out_results)[found_count].title = strdup(title); + (*out_results)[found_count].snippet = + strdup(snippet_text ? snippet_text : ""); + found_count++; + } - if (title) xmlFree(title); - if (url) xmlFree(url); - if (snippet_text) xmlFree(snippet_text); - if (linkObj) xmlXPathFreeObject(linkObj); - if (titleObj) xmlXPathFreeObject(titleObj); - if (snippetObj) xmlXPathFreeObject(snippetObj); + if (title) xmlFree(title); + if (url) xmlFree(url); + if (snippet_text) xmlFree(snippet_text); + if (linkObj) xmlXPathFreeObject(linkObj); + if (titleObj) xmlXPathFreeObject(titleObj); + if (snippetObj) xmlXPathFreeObject(snippetObj); } xpathCtx->node = NULL; @@ -284,36 +284,36 @@ static int parse_yahoo(const char *engine_name, xmlDocPtr doc, } const SearchEngine ENGINE_REGISTRY[] = { - {.name = "DuckDuckGo Lite", - .base_url = "https://lite.duckduckgo.com/lite/?q=", - .host_header = "lite.duckduckgo.com", - .referer = "https://lite.duckduckgo.com/", - .page_param = "s", - .page_multiplier = 30, - .page_base = 0, - .parser = parse_ddg_lite}, - {.name = "Startpage", - .base_url = "https://www.startpage.com/sp/search?q=", - .host_header = "www.startpage.com", - .referer = "https://www.startpage.com/", - .page_param = "page", - .page_multiplier = 1, - .page_base = 1, - .parser = parse_startpage}, - {.name = "Yahoo", - .base_url = "https://search.yahoo.com/search?p=", - .host_header = "search.yahoo.com", - .referer = "https://search.yahoo.com/", - .page_param = "b", - .page_multiplier = 10, - .page_base = 1, - .parser = parse_yahoo}}; + {.name = "DuckDuckGo Lite", + .base_url = "https://lite.duckduckgo.com/lite/?q=", + .host_header = "lite.duckduckgo.com", + .referer = "https://lite.duckduckgo.com/", + .page_param = "s", + .page_multiplier = 30, + .page_base = 0, + .parser = parse_ddg_lite}, + {.name = "Startpage", + .base_url = "https://www.startpage.com/sp/search?q=", + .host_header = "www.startpage.com", + .referer = "https://www.startpage.com/", + .page_param = "page", + .page_multiplier = 1, + .page_base = 1, + .parser = parse_startpage}, + {.name = "Yahoo", + .base_url = "https://search.yahoo.com/search?p=", + .host_header = "search.yahoo.com", + .referer = "https://search.yahoo.com/", + .page_param = "b", + .page_multiplier = 10, + .page_base = 1, + .parser = parse_yahoo}}; const int ENGINE_COUNT = sizeof(ENGINE_REGISTRY) / sizeof(SearchEngine); static void configure_curl_handle(CURL *curl, const char *full_url, - MemoryBuffer *chunk, - struct curl_slist *headers) { + MemoryBuffer *chunk, + struct curl_slist *headers) { curl_easy_setopt(curl, CURLOPT_URL, full_url); curl_easy_setopt(curl, CURLOPT_HTTPHEADER, headers); curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, WriteMemoryCallback); @@ -340,62 +340,62 @@ int scrape_engines_parallel(ScrapeJob *jobs, int num_jobs) { retry: CURLM *multi_handle = curl_multi_init(); if (!multi_handle) { - return -1; + return -1; } for (int i = 0; i < num_jobs; i++) { - ScrapeJob *job = &jobs[i]; + ScrapeJob *job = &jobs[i]; - if (job->handle) { - curl_easy_cleanup(job->handle); - job->handle = NULL; - } - if (job->response.memory) { - free(job->response.memory); - } + if (job->handle) { + curl_easy_cleanup(job->handle); + job->handle = NULL; + } + if (job->response.memory) { + free(job->response.memory); + } - job->handle = curl_easy_init(); - if (!job->handle) { - continue; - } + job->handle = curl_easy_init(); + if (!job->handle) { + continue; + } - job->response.memory = (char *)malloc(16384); - job->response.size = 0; - job->response.capacity = 16384; + job->response.memory = (char *)malloc(16384); + job->response.size = 0; + job->response.capacity = 16384; - char full_url[1024]; - char *encoded_query = curl_easy_escape(job->handle, job->query, 0); - if (!encoded_query) { - curl_easy_cleanup(job->handle); - job->handle = NULL; - continue; - } + char full_url[1024]; + char *encoded_query = curl_easy_escape(job->handle, job->query, 0); + if (!encoded_query) { + curl_easy_cleanup(job->handle); + job->handle = NULL; + continue; + } - int page = (job->page < 1) ? 1 : job->page; - int page_value = (page - 1) * job->engine->page_multiplier + job->engine->page_base; + int page = (job->page < 1) ? 1 : job->page; + int page_value = (page - 1) * job->engine->page_multiplier + job->engine->page_base; - snprintf(full_url, sizeof(full_url), "%s%s&%s=%d", - job->engine->base_url, - encoded_query, - job->engine->page_param, - page_value); - curl_free(encoded_query); + snprintf(full_url, sizeof(full_url), "%s%s&%s=%d", + job->engine->base_url, + encoded_query, + job->engine->page_param, + page_value); + curl_free(encoded_query); - struct curl_slist *headers = NULL; - char host_buf[256], ref_buf[256]; - snprintf(host_buf, sizeof(host_buf), "Host: %s", job->engine->host_header); - snprintf(ref_buf, sizeof(ref_buf), "Referer: %s", job->engine->referer); - headers = curl_slist_append(headers, host_buf); - headers = curl_slist_append(headers, ref_buf); - headers = curl_slist_append(headers, "Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"); - headers = curl_slist_append(headers, "Accept-Language: en-US,en;q=0.5"); - headers = curl_slist_append(headers, "DNT: 1"); + struct curl_slist *headers = NULL; + char host_buf[256], ref_buf[256]; + snprintf(host_buf, sizeof(host_buf), "Host: %s", job->engine->host_header); + snprintf(ref_buf, sizeof(ref_buf), "Referer: %s", job->engine->referer); + headers = curl_slist_append(headers, host_buf); + headers = curl_slist_append(headers, ref_buf); + headers = curl_slist_append(headers, "Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"); + headers = curl_slist_append(headers, "Accept-Language: en-US,en;q=0.5"); + headers = curl_slist_append(headers, "DNT: 1"); - configure_curl_handle(job->handle, full_url, &job->response, headers); + configure_curl_handle(job->handle, full_url, &job->response, headers); - curl_easy_setopt(job->handle, CURLOPT_PRIVATE, headers); + curl_easy_setopt(job->handle, CURLOPT_PRIVATE, headers); - curl_multi_add_handle(multi_handle, job->handle); + curl_multi_add_handle(multi_handle, job->handle); } usleep(100000 + (rand() % 100000)); @@ -404,86 +404,86 @@ retry: curl_multi_perform(multi_handle, &still_running); do { - int numfds = 0; - CURLMcode mc = curl_multi_wait(multi_handle, NULL, 0, 1000, &numfds); + int numfds = 0; + CURLMcode mc = curl_multi_wait(multi_handle, NULL, 0, 1000, &numfds); - if (mc != CURLM_OK) { - break; - } + if (mc != CURLM_OK) { + break; + } - curl_multi_perform(multi_handle, &still_running); + curl_multi_perform(multi_handle, &still_running); } while (still_running); CURLMsg *msg; int msgs_left; while ((msg = curl_multi_info_read(multi_handle, &msgs_left))) { - if (msg->msg == CURLMSG_DONE) { - CURL *handle = msg->easy_handle; - - for (int i = 0; i < num_jobs; i++) { - if (jobs[i].handle && jobs[i].handle == handle) { - ScrapeJob *job = &jobs[i]; - - long response_code; - curl_easy_getinfo(handle, CURLINFO_RESPONSE_CODE, &response_code); - - if (msg->data.result == CURLE_OK && job->response.size > 0) { - xmlDocPtr doc = htmlReadMemory( - job->response.memory, job->response.size, NULL, NULL, - HTML_PARSE_RECOVER | HTML_PARSE_NOERROR | HTML_PARSE_NOWARNING); - - if (doc) { - job->results_count = job->engine->parser( - job->engine->name, doc, job->out_results, job->max_results); - xmlFreeDoc(doc); - } - } else { - job->results_count = 0; - } - - struct curl_slist *headers; - curl_easy_getinfo(handle, CURLINFO_PRIVATE, &headers); - if (headers) curl_slist_free_all(headers); - - free(job->response.memory); - job->response.memory = NULL; - curl_multi_remove_handle(multi_handle, handle); - if (handle) curl_easy_cleanup(handle); - job->handle = NULL; - break; - } + if (msg->msg == CURLMSG_DONE) { + CURL *handle = msg->easy_handle; + + for (int i = 0; i < num_jobs; i++) { + if (jobs[i].handle && jobs[i].handle == handle) { + ScrapeJob *job = &jobs[i]; + + long response_code; + curl_easy_getinfo(handle, CURLINFO_RESPONSE_CODE, &response_code); + + if (msg->data.result == CURLE_OK && job->response.size > 0) { + xmlDocPtr doc = htmlReadMemory( + job->response.memory, job->response.size, NULL, NULL, + HTML_PARSE_RECOVER | HTML_PARSE_NOERROR | HTML_PARSE_NOWARNING); + + if (doc) { + job->results_count = job->engine->parser( + job->engine->name, doc, job->out_results, job->max_results); + xmlFreeDoc(doc); } + } else { + job->results_count = 0; + } + + struct curl_slist *headers; + curl_easy_getinfo(handle, CURLINFO_PRIVATE, &headers); + if (headers) curl_slist_free_all(headers); + + free(job->response.memory); + job->response.memory = NULL; + curl_multi_remove_handle(multi_handle, handle); + if (handle) curl_easy_cleanup(handle); + job->handle = NULL; + break; + } } } + } curl_multi_cleanup(multi_handle); if (retries < max_proxy_retries && proxy_count > 0) { - int any_failed = 0; - for (int i = 0; i < num_jobs; i++) { - if (jobs[i].results_count == 0 && jobs[i].response.size == 0) { - any_failed = 1; - break; - } - } - if (any_failed) { - retries++; - goto retry; + int any_failed = 0; + for (int i = 0; i < num_jobs; i++) { + if (jobs[i].results_count == 0 && jobs[i].response.size == 0) { + any_failed = 1; + break; } } + if (any_failed) { + retries++; + goto retry; + } + } return 0; } int scrape_engine(const SearchEngine *engine, const char *query, - SearchResult **out_results, int max_results) { + SearchResult **out_results, int max_results) { ScrapeJob job = { - .engine = engine, - .query = (char *)query, - .out_results = out_results, - .max_results = max_results, - .results_count = 0, - .page = 1 + .engine = engine, + .query = (char *)query, + .out_results = out_results, + .max_results = max_results, + .results_count = 0, + .page = 1 }; scrape_engines_parallel(&job, 1); diff --git a/src/Scraping/Scraping.h b/src/Scraping/Scraping.h index 0865825..e33f529 100644 --- a/src/Scraping/Scraping.h +++ b/src/Scraping/Scraping.h @@ -11,7 +11,7 @@ typedef struct { } SearchResult; typedef int (*ParserFunc)(const char *engine_name, xmlDocPtr doc, - SearchResult **out_results, int max_results); + SearchResult **out_results, int max_results); typedef struct { const char *name; @@ -20,8 +20,8 @@ typedef struct { const char *referer; const char *page_param; - int page_multiplier; - int page_base; + int page_multiplier; + int page_base; ParserFunc parser; } SearchEngine; @@ -36,7 +36,7 @@ typedef struct { char *query; SearchResult **out_results; int max_results; - int page; + int page; CURL *handle; MemoryBuffer response; int results_count; @@ -46,7 +46,7 @@ extern const SearchEngine ENGINE_REGISTRY[]; extern const int ENGINE_COUNT; int scrape_engine(const SearchEngine *engine, const char *query, - SearchResult **out_results, int max_results); + SearchResult **out_results, int max_results); int scrape_engines_parallel(ScrapeJob *jobs, int num_jobs); diff --git a/src/Utility/Display.c b/src/Utility/Display.c index 492e998..9737757 100644 --- a/src/Utility/Display.c +++ b/src/Utility/Display.c @@ -5,42 +5,42 @@ #include <strings.h> char *pretty_display_url(const char *input) { - if (!input) return NULL; - - const char *start = input; - - const char *protocol_pos = strstr(input, "://"); - if (protocol_pos) { - start = protocol_pos + 3; - } - - if (strncasecmp(start, "www.", 4) == 0) { - start += 4; - } - - size_t input_len = strlen(start); - char temp[512]; - strncpy(temp, start, sizeof(temp) - 1); - temp[sizeof(temp) - 1] = '\0'; - - if (input_len > 0 && temp[input_len - 1] == '/') { - temp[input_len - 1] = '\0'; - } - - char *output = (char *)malloc(strlen(temp) * 3 + 1); - if (!output) return NULL; - - size_t j = 0; - for (size_t i = 0; temp[i] != '\0'; i++) { - if (temp[i] == '/') { - output[j++] = ' '; - output[j++] = '>'; - output[j++] = ' '; - } else { - output[j++] = (char)tolower((unsigned char)temp[i]); - } + if (!input) return NULL; + + const char *start = input; + + const char *protocol_pos = strstr(input, "://"); + if (protocol_pos) { + start = protocol_pos + 3; + } + + if (strncasecmp(start, "www.", 4) == 0) { + start += 4; + } + + size_t input_len = strlen(start); + char temp[512]; + strncpy(temp, start, sizeof(temp) - 1); + temp[sizeof(temp) - 1] = '\0'; + + if (input_len > 0 && temp[input_len - 1] == '/') { + temp[input_len - 1] = '\0'; + } + + char *output = (char *)malloc(strlen(temp) * 3 + 1); + if (!output) return NULL; + + size_t j = 0; + for (size_t i = 0; temp[i] != '\0'; i++) { + if (temp[i] == '/') { + output[j++] = ' '; + output[j++] = '>'; + output[j++] = ' '; + } else { + output[j++] = (char)tolower((unsigned char)temp[i]); } - output[j] = '\0'; + } + output[j] = '\0'; - return output; + return output; } diff --git a/src/Utility/Utility.c b/src/Utility/Utility.c index 8e5af92..07fa1da 100644 --- a/src/Utility/Utility.c +++ b/src/Utility/Utility.c @@ -1,8 +1,8 @@ #include "Utility.h" int hex_to_int(char c) { - if (c >= '0' && c <= '9') return c - '0'; - if (c >= 'a' && c <= 'f') return c - 'a' + 10; - if (c >= 'A' && c <= 'F') return c - 'A' + 10; - return -1; + if (c >= '0' && c <= '9') return c - '0'; + if (c >= 'a' && c <= 'f') return c - 'a' + 10; + if (c >= 'A' && c <= 'F') return c - 'A' + 10; + return -1; } |
