diff options
Diffstat (limited to 'src/Infobox/Dictionary.c')
| -rw-r--r-- | src/Infobox/Dictionary.c | 246 |
1 files changed, 246 insertions, 0 deletions
diff --git a/src/Infobox/Dictionary.c b/src/Infobox/Dictionary.c new file mode 100644 index 0000000..a835899 --- /dev/null +++ b/src/Infobox/Dictionary.c @@ -0,0 +1,246 @@ +#include "Dictionary.h" +#include <curl/curl.h> +#include <libxml/HTMLparser.h> +#include <libxml/xpath.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <strings.h> +#include <ctype.h> + +static const char *PREFIXES[] = { + "what is the definition of ", "what's the definition of ", + "what is the meaning of ", "what's the meaning of ", + "what does the word ", "definition of ", "meaning of ", "def of ", + "define ", "definition ", "define:", "def ", "def:", + "what does ", "what is ", "what's ", "whats ", + "meaning ", "dictionary ", "dict ", NULL +}; + +static const char *SUFFIXES[] = { + " definition", " def", " meaning", " mean", " means", + " dictionary", " dict", " define", " defined", + " definition?", " def?", " meaning?", " mean?", " means?", + " in english", " in english?", NULL +}; + +static const char *SKIP_WORDS[] = {"of ", "the ", "a ", "an ", NULL}; + +static const char *strcasestr_impl(const char *haystack, const char *needle) { + if (!haystack || !needle || !*needle) return haystack; + size_t len = strlen(needle); + for (const char *h = haystack; *h; h++) { + if (strncasecmp(h, needle, len) == 0) return h; + } + return NULL; +} + +struct MemStruct { char *memory; size_t size; }; + +static size_t WriteCallback(void *contents, size_t size, size_t nmemb, void *userp) { + size_t realsize = size * nmemb; + struct MemStruct *mem = (struct MemStruct *)userp; + char *ptr = realloc(mem->memory, mem->size + realsize + 1); + if (!ptr) return 0; + mem->memory = ptr; + memcpy(&(mem->memory[mem->size]), contents, realsize); + mem->size += realsize; + mem->memory[mem->size] = 0; + return realsize; +} + +static char *xpath_text(xmlDocPtr doc, const char *xpath) { + xmlXPathContextPtr ctx = xmlXPathNewContext(doc); + if (!ctx) return NULL; + xmlXPathObjectPtr obj = xmlXPathEvalExpression((const xmlChar *)xpath, ctx); + xmlXPathFreeContext(ctx); + if (!obj || !obj->nodesetval || obj->nodesetval->nodeNr == 0) { + if (obj) xmlXPathFreeObject(obj); + return NULL; + } + xmlChar *content = xmlNodeGetContent(obj->nodesetval->nodeTab[0]); + char *result = content ? strdup((char *)content) : NULL; + if (content) xmlFree(content); + xmlXPathFreeObject(obj); + return result; +} + +static char *build_html(const char *word, const char *pron, const char *pos, + const char *def, const char *ex) { + char html[4096]; + int n = snprintf(html, sizeof(html), "<div class='dict-container' style='line-height: 1.6;'>"); + if (word) n += snprintf(html + n, sizeof(html) - n, + "<div style='font-size: 1.3em; font-weight: bold; margin-bottom: 4px;'>%s</div>", word); + if (pron) n += snprintf(html + n, sizeof(html) - n, + "<div style='color: #666; margin-bottom: 8px;'>/%s/</div>", pron); + if (pos) n += snprintf(html + n, sizeof(html) - n, + "<div style='font-style: italic; color: #888; margin-bottom: 8px;'>%s</div>", pos); + if (def) n += snprintf(html + n, sizeof(html) - n, + "<div style='margin-bottom: 8px;'>%s</div>", def); + if (ex) n += snprintf(html + n, sizeof(html) - n, + "<div style='color: #555; font-style: italic; margin-top: 8px;'>\"%s\"</div>", ex); + snprintf(html + n, sizeof(html) - n, "</div>"); + return strdup(html); +} + +static char *extract_word(const char *query) { + if (!query) return NULL; + + const char *start = query; + + for (int i = 0; PREFIXES[i]; i++) { + size_t len = strlen(PREFIXES[i]); + if (strncasecmp(start, PREFIXES[i], len) == 0) { + start += len; + break; + } + } + + while (*start == ' ') start++; + char *word = strdup(start); + if (!word) return NULL; + + int changed = 1; + while (changed) { + changed = 0; + for (int i = 0; SKIP_WORDS[i]; i++) { + size_t len = strlen(SKIP_WORDS[i]); + if (strncasecmp(word, SKIP_WORDS[i], len) == 0) { + memmove(word, word + len, strlen(word + len) + 1); + changed = 1; + break; + } + } + } + + changed = 1; + while (changed) { + changed = 0; + for (int i = 0; SUFFIXES[i]; i++) { + const char *found = strcasestr_impl(word, SUFFIXES[i]); + if (found) { + char *pos = word + (found - word); + *pos = '\0'; + changed = 1; + break; + } + } + } + + size_t len = strlen(word); + while (len > 0 && (word[len-1] == ' ' || word[len-1] == '?' || + word[len-1] == '!' || word[len-1] == '.')) { + word[--len] = '\0'; + } + + if (len == 0) { free(word); return NULL; } + + for (size_t i = 0; i < len; i++) word[i] = tolower((unsigned char)word[i]); + char *space = strchr(word, ' '); + if (space) *space = '\0'; + + return word; +} + +int is_dictionary_query(const char *query) { + if (!query) return 0; + + for (int i = 0; PREFIXES[i]; i++) { + size_t len = strlen(PREFIXES[i]); + if (strncasecmp(query, PREFIXES[i], len) == 0) { + const char *after = query + len; + while (*after == ' ') after++; + if (*after != '\0') return 1; + } + } + + for (int i = 0; SUFFIXES[i]; i++) { + const char *pos = strcasestr_impl(query, SUFFIXES[i]); + if (pos) { + const char *after = pos + strlen(SUFFIXES[i]); + while (*after == ' ' || *after == '?' || *after == '!' || *after == '.') after++; + if (*after == '\0' && pos > query && (pos - query) < 100) return 1; + } + } + + if (strncasecmp(query, "what is ", 8) == 0 || + strncasecmp(query, "what's ", 7) == 0 || + strncasecmp(query, "whats ", 6) == 0) { + const char *word = query + (strncasecmp(query, "what is ", 8) == 0 ? 8 : + strncasecmp(query, "what's ", 7) == 0 ? 7 : 6); + const char *articles[] = {"the ", "your ", "my ", "his ", "her ", "their ", + "our ", "this ", "that ", "these ", "those ", "a ", "an ", NULL}; + for (int i = 0; articles[i]; i++) { + if (strncasecmp(word, articles[i], strlen(articles[i])) == 0) return 0; + } + const char *space = strchr(word, ' '); + if (!space || *(space + 1) == '\0' || *(space + 1) == '?') return 1; + } + + return 0; +} + +char *construct_dictionary_url(const char *query) { + char *word = extract_word(query); + if (!word) return NULL; + + CURL *curl = curl_easy_init(); + if (!curl) { free(word); return NULL; } + + char *escaped = curl_easy_escape(curl, word, 0); + const char *base = "https://dictionary.cambridge.org/dictionary/english/"; + char *url = malloc(strlen(base) + strlen(escaped) + 1); + if (url) { + strcpy(url, base); + strcat(url, escaped); + } + + curl_free(escaped); + curl_easy_cleanup(curl); + free(word); + return url; +} + +InfoBox fetch_dictionary_data(const char *query) { + InfoBox info = {NULL, NULL, NULL, NULL}; + + char *url = construct_dictionary_url(query); + if (!url) return info; + + CURL *curl = curl_easy_init(); + if (!curl) { free(url); return info; } + + struct MemStruct chunk = {malloc(1), 0}; + curl_easy_setopt(curl, CURLOPT_URL, url); + curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, WriteCallback); + curl_easy_setopt(curl, CURLOPT_WRITEDATA, &chunk); + curl_easy_setopt(curl, CURLOPT_USERAGENT, "Mozilla/5.0"); + curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1L); + + if (curl_easy_perform(curl) == CURLE_OK && chunk.size > 0) { + htmlDocPtr doc = htmlReadMemory(chunk.memory, chunk.size, url, NULL, + HTML_PARSE_RECOVER | HTML_PARSE_NOERROR | HTML_PARSE_NOWARNING); + if (doc) { + char *word = xpath_text(doc, "//span[@class='hw dhw']"); + char *pron = xpath_text(doc, "//span[@class='us dpron-i']//span[@class='ipa dipa lpr-2 lpl-1']"); + char *pos = xpath_text(doc, "//span[@class='pos dpos']"); + char *def = xpath_text(doc, "(//div[@class='def ddef_d db'])[1]"); + char *ex = xpath_text(doc, "(//span[@class='eg deg'])[1]"); + + if (word && def) { + info.title = strdup("Dictionary"); + info.extract = build_html(word, pron, pos, def, ex); + info.thumbnail_url = strdup("/static/dictionary.jpg"); + info.url = strdup(url); + } + + free(word); free(pron); free(pos); free(def); free(ex); + xmlFreeDoc(doc); + } + } + + curl_easy_cleanup(curl); + free(chunk.memory); + free(url); + return info; +}
\ No newline at end of file |
