diff --git a/.spellcheck_ignore.txt b/.spellcheck_ignore.txt new file mode 100644 index 0000000..d40f215 --- /dev/null +++ b/.spellcheck_ignore.txt @@ -0,0 +1,207 @@ +ASAN +ascii +AVL +backend +Borisov +br +BST +CentOS +CMake +codebase +Combinatorial +Combinators +combinators +CPP +css +CSSOM +csswg +customizable +CXX +deallocates +Deallocation +distros +DNS +DOCTYPE +DOM +encoding's +encodings +Encodings +EOF +EUC +Flexbox +frontend +fuzzer +fuzzers +gb +glyphs +Homebrew +hostnames +href +html +IDN +IDNA +idna +IDNs +img +initializations +innerHTML +IoT +JIS +js +JSON +keyring +lexbor +`lexbor` +LEXBOR +`lexbor`'s +lexbor's +li +lifecycle +lightningcss +LXB +lxb +macintosh +macOS +macos +MacPorts +macports +mailto +Makefiles +malloc +maxdepth +md +mediaqueries +mem +memset +microsoft +mkdir +mq +msan +MSYS +msys +multipage +mutexes +myhtml +namespace +Namespace +Namespaces +namespaces +NFD +nFind +NFKC +NFKD +NGINX +nHTML +NJS +normalizer +Normalizer +np +nResult +nsize +nTree +num +oklab +oklch +ol +OpenType +opentype +otff +outbuf +overline +parser's +parsers +pc +png +pos +pre +Preprocessing +preprocessing +prescan +Prescanning +printf +programmatically +Punycode +punycode +px +queueing +RCDATA +realloc +reallocations +releasever +renderer +repo +repos +rfc +rgb +rgba +RHEL +rhel +Roadmap +roadmap +ru +serializer +sexpr +sizeof +slctrs +spinlocks +src +ss +sst +stderr +stdin +stdout +str +STR +strlen +struct +stylesheet +Stylesheet +StyleSheet +stylesheets +subdirectory +substring +sudo +superfast +svg +SVG +symlink +szie +textarea +tkz +tle +tmp +toctree +Tokenization +tokenization +tokenize +Tokenizer +tokenizer +tokenizer's +tokenizing +tStyleSheet +txt +typedef +UB +ubuntu +uc +UI +ul +UNDEF +unformatted +unicode +url +usr +UTF +uTf +utf +UTILS +variadic +WHATWG +whatwg +whitespace +WHITESPACE +whitespaces +WS +www +xenial +YPE \ No newline at end of file diff --git a/Makefile b/Makefile index 9c1eaf4..4ece681 100644 --- a/Makefile +++ b/Makefile @@ -1,5 +1,6 @@ +PYTHON ?= python3 SPHINX ?= sphinx-build -SERVER ?= python3 -m http.server +SERVER ?= $(PYTHON) -m http.server VENVDIR ?= .venv VENV ?= $(VENVDIR)/bin/activate @@ -21,7 +22,7 @@ endef # Ensure virtual environment exists $(VENVDIR): - python3 -m venv $(VENVDIR) + $(PYTHON) -m venv $(VENVDIR) # Install dependencies inside virtual environment .PHONY: install @@ -84,3 +85,11 @@ upload: clean-doc backup deploy rsync -rctvn $(DEPLOYDIR)/ $(HOST):$(REMOTEDIR) # Final sync if dry-run is successful rsync -rctv $(DEPLOYDIR)/ $(HOST):$(REMOTEDIR) + +.PHONY: linkcheck +linkcheck: + $(call venv_exec, $(SPHINX) -b linkcheck $(SOURCEDIR) $(BUILDDIR)) + +.PHONY: spellcheck +spellcheck: + $(call venv_exec, $(PYTHON) -m pyspelling -c spellcheck.yaml -j $(shell nproc)) \ No newline at end of file diff --git a/source/articles/example-CSS-selectors-easy-way.md b/source/articles/example-CSS-selectors-easy-way.md deleted file mode 100644 index f9a3239..0000000 --- a/source/articles/example-CSS-selectors-easy-way.md +++ /dev/null @@ -1,148 +0,0 @@ -# Examples: CSS selectors, the easy way - -Let's start with an easy example of using `lexbor` for parsing and serializing -CSS selectors. This example breaks down the major steps and elements, explaining -the overall purpose, requirements, and assumptions at each step. 
- -The code for all examples is available in our [GitHub -repository](https://github.com/lexbor/lexbor/tree/master/examples/lexbor); this -specific example can be found at -[list_easy_way.c](https://github.com/lexbor/lexbor/blob/master/examples/lexbor/css/selectors/list_easy_way.c). - - -## Overall Purpose - -The example demonstrates how to use `lexbor` to parse a CSS selector string, -create a selector list, and then serialize the selector list. It also shows how -to handle parser logs and properly clean up allocated resources. - -This guide is designed to help you utilize `lexbor` for parsing and serializing -CSS selectors, with a focus on error handling and resource management. - -Please note that this is a basic (or *naive*) approach. A more advanced, -real-world example will be provided later. - - -## Major Steps and Elements - -### 1. Library Inclusion and Callback Function - -The code includes the necessary header files and defines a callback function -(`callback`) that prints the parsed data. - -```c -#include - -lxb_status_t callback(const lxb_char_t *data, size_t len, void *ctx) -{ - printf("%.*s", (int) len, (const char *) data); - return LXB_STATUS_OK; -} -``` - -### 2. Main Function - -The `main` function initializes the CSS parser, parses a CSS selector string, -and then serializes the resulting selector list. - -```c -int main(int argc, const char *argv[]) -{ - // ... (variable declarations) - - // Create parser. - parser = lxb_css_parser_create(); - status = lxb_css_parser_init(parser, NULL); - - // Check if parser initialization was successful. - if (status != LXB_STATUS_OK) { - return EXIT_FAILURE; - } - - // Parse and get the log. - // ... - - // Selector List Serialization. - // ... - - // Destroy resources for Parser. - // ... - - // Destroy all Selector List memory. - // ... - - return EXIT_SUCCESS; -} -``` - - -### 3. CSS Selector String and Parser Initialization - -The code defines a CSS selector string (`slctrs`) and initializes the CSS -parser. - -```c -static const lxb_char_t slctrs[] = ":has(div, :not(as, 1%, .class), #hash)"; - -parser = lxb_css_parser_create(); -status = lxb_css_parser_init(parser, NULL); -``` - - -### 4. Parsing CSS Selector and Handling Errors - -The code parses the CSS selector string, checks for parsing errors, and prints -the result. - -```c -list = lxb_css_selectors_parse(parser, slctrs, - sizeof(slctrs) / sizeof(lxb_char_t) - 1); - -if (parser->status != LXB_STATUS_OK) { - printf("Something went wrong\n"); - return EXIT_FAILURE; -} -``` - - -### 5. Selector List Serialization and Handling Logs - -The example serializes the parsed selector list and prints any parser logs. - -```c -printf("Result: "); -(void) lxb_css_selector_serialize_list(list, callback, NULL); -printf("\n"); - -// Check if there are any parser logs. -if (lxb_css_log_length(lxb_css_parser_log(parser)) != 0) { - printf("Log:\n"); - // Serialize parser logs with proper indentation. - (void) lxb_css_log_serialize(parser->log, callback, NULL, - indent, indent_length); - printf("\n"); -} -``` - - -### 6. Resource Cleanup - -Finally, the code destroys resources for the parser and frees memory allocated -for the selector list. - -```c -(void) lxb_css_parser_destroy(parser, true); -lxb_css_selector_list_destroy_memory(list); -``` - - -## Requirements and Assumptions - -Some key points to note: - -- The CSS selector string (`slctrs`) is predefined and used for parsing. -- It is assumed that parser initialization and selector list creation are - successful. 
-- Error handling is demonstrated by checking the parser's status, though it can - be further improved. -- The cleanup section ensures proper destruction of parser resources and memory. diff --git a/source/articles/index.md b/source/articles/index.md index 82f83e7..078dc8e 100644 --- a/source/articles/index.md +++ b/source/articles/index.md @@ -1,4 +1,4 @@ -# Articles, Examples +# Articles This series of articles discusses various aspects of `lexbor` implementation and design choices. @@ -8,12 +8,3 @@ This series of articles discusses various aspects of `lexbor` implementation and part* ``` - -These articles delve into some of the examples in more detail to help you use `lexbor` more effectively. - -```{toctree} -:maxdepth: 1 -:glob: - -example* -``` diff --git a/source/articles/part-1-html.md b/source/articles/part-1-html.md index 3dc9506..f7498d6 100644 --- a/source/articles/part-1-html.md +++ b/source/articles/part-1-html.md @@ -166,7 +166,7 @@ In the HTML namespace, the `` tag is treated as text, so no `` element is created. In the SVG namespace, however, an element is created based on the `` tag. Thus, tags behave differently depending on the namespace. -But there’s more. The tokenizer must also be aware of the current namespace to +But there's more. The tokenizer must also be aware of the current namespace to process `CDATA` correctly. Consider two examples involving `CDATA` and two different namespaces: @@ -413,7 +413,7 @@ affect the tokenizer. These dependencies are largely due to namespaces. ## How to Solve Issues? -I will outline an HTML parser implementation for my Lexbor project, along with +I will outline an HTML parser implementation for my `lexbor` project, along with solutions to the problems discussed. ### Preprocessing @@ -674,7 +674,7 @@ tree_build_in_body_character(token) { } ``` -In Lexbor HTML: +In `lexbor` HTML: ```c tree_build_in_body_character(token) { lexbor_str_t str = {0}; @@ -748,7 +748,7 @@ move forward. Next, I will focus on CSS parsing and developing a custom grammar ## Sources The approach to parsing and HTML tree construction described here is implemented -in my [Lexbor](https://github.com/lexbor/lexbor) HTML library. +in my [`lexbor`](https://github.com/lexbor/lexbor) HTML library. ## P.S. diff --git a/source/articles/part-2-css.md b/source/articles/part-2-css.md index 4ab8a03..74e4a45 100644 --- a/source/articles/part-2-css.md +++ b/source/articles/part-2-css.md @@ -3,7 +3,7 @@ Hello, everyone! We continue our series on developing a browser engine. Better late than never! -Despite the long break, I’ll update you on the lexbor project and its current +Despite the long break, I'll update you on the lexbor project and its current status at the end of this article. In this article, we'll explore the specifics of parsing Cascading Style Sheets @@ -41,7 +41,7 @@ Recommendation, etc. You can see all stages on marked with its current development stage, ranging from early drafts to final recommendations. -We will focus on Editor’s Draft and Working Draft with a glance at +We will focus on Editor's Draft and Working Draft with a glance at Recommendation. Since W3C standards evolve slowly, by the time a module reaches Recommendation, it might already be outdated. Thus, we'll treat CSS standards as living documents, like the HTML standard. @@ -116,7 +116,7 @@ A specific structure includes: 1. Stylesheet (formerly List of Rules) 2. At-Rule 3. Qualified Rule -4. Block’s contents +4. Block's contents 5. Declaration 6. Component value 7. 
Simple block @@ -242,7 +242,7 @@ Parsing selectors will proceed as follows: This sounds simple, but in practice, it's more complex: - The knowledge about which stage to switch to must be passed to each module; - they don’t inherently know this. + they don't inherently know this. - We need to decide whether to consume the `{` or `}` token before passing it to the next stage. - Nesting depth must be tracked. We can't just pass control to the next module @@ -296,7 +296,7 @@ structure. This is a form of inside-out parsing. This approach is implemented in my `lexbor` project. -Here’s how it works: We set up callbacks for different stages of parsing the CSS +Here's how it works: We set up callbacks for different stages of parsing the CSS structure. Each callback is called only once at the beginning of a stage, not for every token. @@ -608,7 +608,7 @@ All these tests are valid, but the result after parsing will always be ` = a b c`. The question arises—how do we compare this with others? My intuition suggested that the task had become significantly more complicated, but a sense of determination (not foolishness) drove me to address it directly. As expected, -it didn’t work out right away; it required some thought! +it didn't work out right away; it required some thought! Consider this example: ``` @@ -633,7 +633,7 @@ inconsistencies. The most reliable solution turned out to be generating the test and the result separately. This means that forming the result goes through the same stages as -forming the test. Although this approach is costly, it’s manageable since +forming the test. Although this approach is costly, it's manageable since real-time performance is not a constraint. As a result, we now have an excellent tool for generating tests for grammars. diff --git a/source/documentation.md b/source/documentation.md index d2fbe0f..6590e06 100644 --- a/source/documentation.md +++ b/source/documentation.md @@ -171,7 +171,7 @@ make object creation and memory management in our own way. Many classic algorithms used in `lexbor` are adapted to meet the specific needs of the project. -- We're open to using third-party code, but it’s often simpler to start from +- We're open to using third-party code, but it's often simpler to start from scratch than to add extra dependencies (looking at you, Node.js). - Some functions are platform-dependent, such as threading, timers, I/O, and @@ -324,7 +324,7 @@ void - The `*_destroy` functions always check if the object is `NULL`; if so, they return `NULL`. -- If the `*_destroy` function doesn’t take the `bool self_destroy` argument, the +- If the `*_destroy` function doesn't take the `bool self_destroy` argument, the object can only be created using the `*_create` function (i.e., not on the stack). diff --git a/source/download.md b/source/download.md index 08ceb18..7c14421 100644 --- a/source/download.md +++ b/source/download.md @@ -43,7 +43,7 @@ The `lexbor` binaries are available for: 1. Download the `lexbor` [signing key](https://lexbor.com/keys/lexbor_signing.key) used for our repositories - and add it to `apt`’s keyring: + and add it to `apt`'s keyring: ```sh curl -O https://lexbor.com/keys/lexbor_signing.key @@ -115,7 +115,7 @@ The `lexbor` binaries are available for: 1. 
Download the `lexbor` [signing key](https://lexbor.com/keys/lexbor_signing.key) used for our repositories - and add it to `apt`’s keyring: + and add it to `apt`'s keyring: ```sh curl -O https://lexbor.com/keys/lexbor_signing.key diff --git a/source/examples/css/StyleSheet.md b/source/examples/css/StyleSheet.md new file mode 100644 index 0000000..8cf4e5f --- /dev/null +++ b/source/examples/css/StyleSheet.md @@ -0,0 +1,119 @@ +# Parsing and Serializing CSS Stylesheet + +This article explains example code that demonstrates +how to parse and serialize a CSS stylesheet using the `lexbor` library. The +example can be found in the file [lexbor/css/StyleSheet.c](https://github.com/lexbor/lexbor/blob/master/examples/lexbor/css/StyleSheet.c). + +The provided code example demonstrates how to read a CSS file, parse it using +the `lexbor` library, and then serialize the parsed CSS back to a string. This +example is valuable for developers looking to understand how to interact with +CSS data programmatically using `lexbor`. + +## Key Code Sections + +### Reading the CSS File + +The first significant operation in the code is reading the contents of a CSS +file. This is done using the `lexbor_fs_file_easy_read` function which reads the +contents into memory. + +```c +fl = (const lxb_char_t *) argv[1]; + +css = lexbor_fs_file_easy_read(fl, &css_len); +if (css == NULL) { + FAILED("Failed to read CSS file"); +} +``` + +Here, `argv[1]` is expected to contain the path to the CSS file. The function +`lexbor_fs_file_easy_read` reads the file into a dynamically allocated buffer, +with `css_len` capturing the length of the data. If the file read fails, the +program exits with an error. + +### Initializing the Parser + +Next, the code initializes a `lexbor` CSS parser. This involves creating a +parser instance and initializing it. + +```c +parser = lxb_css_parser_create(); +status = lxb_css_parser_init(parser, NULL); +if (status != LXB_STATUS_OK) { + FAILED("Failed to create CSS Parser"); +} +``` + +First, a new parser instance is created using `lxb_css_parser_create()`. The +`lxb_css_parser_init` function initializes this parser. If the initialization +fails, an error is reported and the program exits. + +### Parsing the Stylesheet + +Once the parser is ready, the next task is to parse the contents of the CSS +file. + +```c +sst = lxb_css_stylesheet_parse(parser, css, css_len); + +(void) lexbor_free(css); +(void) lxb_css_parser_destroy(parser, true); + +if (sst == NULL) { + FAILED("Failed to parse CSS"); +} +``` + +The function `lxb_css_stylesheet_parse` parses the CSS data stored in the buffer +`css` with length `css_len`. After parsing, the buffer is freed and the parser +is destroyed. If parsing fails, the program reports an error and exits. + +### Serializing the Stylesheet + +After parsing the stylesheet, the example serializes it back to a string using a +callback function. + +```c +status = lxb_css_rule_serialize(sst->root, callback, NULL); +if (status != LXB_STATUS_OK) { + FAILED("Failed to serialize StyleSheet"); +} +``` + +The function `lxb_css_rule_serialize` walks through the stylesheet rules, +serializing each one. The `callback` function is called for each chunk of data +during serialization. If an error occurs during serialization, the program +reports it and exits. + +### The Callback Function + +The callback function is straightforward but crucial for outputting the +serialized data. 
+ +```c +lxb_status_t +callback(const lxb_char_t *data, size_t len, void *ctx) +{ + printf("%.*s", (int) len, data); + return LXB_STATUS_OK; +} +``` + +This function simply prints each chunk of serialized data to the standard +output. The `printf` function uses the precision field to handle the length of +data correctly. + +## Notes + +- Ensure that the CSS file exists and is accessible. +- Error handling is fundamental when dealing with file operations and parsing. +- The example provides a clear pathway from reading a file to parsing and + serializing CSS. + +## Summary + +This example demonstrates effectively how to use the `lexbor` library to handle +CSS files. It highlights reading a file, parsing the CSS content, and +serializing the parsed content back to a string. Understanding this example +enables developers to manage CSS data programmatically with `lexbor`, which can +be extended and integrated into larger projects dealing with CSS manipulation. \ No newline at end of file diff --git a/source/examples/css/index.md b/source/examples/css/index.md new file mode 100644 index 0000000..dbdd504 --- /dev/null +++ b/source/examples/css/index.md @@ -0,0 +1,14 @@ +# CSS Examples + +These articles delve into some of the examples in more detail to help you use +`lexbor` more effectively. + +```{toctree} +:maxdepth: 1 +:glob: + +* +selectors/* +syntax/* +syntax/tokenizer/* +``` diff --git a/source/examples/css/selectors/list_easy_way.md b/source/examples/css/selectors/list_easy_way.md new file mode 100644 index 0000000..43f5171 --- /dev/null +++ b/source/examples/css/selectors/list_easy_way.md @@ -0,0 +1,103 @@ +# Parsing and Serializing CSS Selectors with lexbor + +This article discusses a C code example from the `lexbor` library, specifically focusing on the file [lexbor/css/selectors/list_easy_way.c](https://github.com/lexbor/lexbor/blob/master/examples/lexbor/css/selectors/list_easy_way.c). The example demonstrates how to parse CSS selectors and serialize them using the capabilities provided by `lexbor`. + +In this example, we'll cover the process of creating a CSS parser using `lexbor`, parsing a complex selector string, and then serializing it back to a readable form. The workflow includes initialization of the parser, parsing the selector, error handling, and cleanup of resources. + +## Key Code Sections + +### Initialization of the CSS Parser + +First, we need to create and initialize a CSS parser. + +```c +lxb_css_parser_t *parser; +parser = lxb_css_parser_create(); +status = lxb_css_parser_init(parser, NULL); +if (status != LXB_STATUS_OK) { + return EXIT_FAILURE; +} +``` + +Here, `lxb_css_parser_create` allocates memory for the parser, while `lxb_css_parser_init` initializes the parser. If initialization fails (`status != LXB_STATUS_OK`), the program exits with `EXIT_FAILURE`. + +### Parsing CSS Selectors + +Next, we define our CSS selectors string and parse it. + +```c +static const lxb_char_t slctrs[] = ":has(div, :not(as, 1%, .class), #hash)"; +list = lxb_css_selectors_parse(parser, slctrs, + sizeof(slctrs) / sizeof(lxb_char_t) - 1); +if (parser->status != LXB_STATUS_OK) { + printf("Something went wrong\n"); + return EXIT_FAILURE; +} +``` + +The function `lxb_css_selectors_parse` accepts the parser, the CSS selector string, and its length. It returns a pointer to `lxb_css_selector_list_t`, which represents the parsed selector list. Error handling confirms if the parsing was successful by checking `parser->status`. 
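+
+The length passed to `lxb_css_selectors_parse` is computed with
+`sizeof(slctrs) / sizeof(lxb_char_t) - 1` because `slctrs` is a compile-time
+array; the `- 1` drops the terminating null byte. For a selector string that is
+only known at run time, the same call works with `strlen` (from `<string.h>`).
+A minimal sketch, with a hypothetical `user_selector` input variable:
+
+```c
+/* Hypothetical run-time selector string; any NUL-terminated string works. */
+const char *user_selector = "ul.menu > li:first-child";
+
+lxb_css_selector_list_t *user_list;
+
+user_list = lxb_css_selectors_parse(parser, (const lxb_char_t *) user_selector,
+                                    strlen(user_selector));
+if (parser->status != LXB_STATUS_OK) {
+    printf("Something went wrong\n");
+    return EXIT_FAILURE;
+}
+```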
+ +### Callback Function for Serialization + +We use a callback function to process the serialized data. + +```c +lxb_status_t +callback(const lxb_char_t *data, size_t len, void *ctx) +{ + printf("%.*s", (int) len, (const char *) data); + return LXB_STATUS_OK; +} +``` + +The callback simply prints the serialized data to the standard output. + +### Serializing the Selector List + +We serialize the list of selectors to a readable form. + +```c +printf("Result: "); +(void) lxb_css_selector_serialize_list(list, callback, NULL); +printf("\n"); +``` + +The `lxb_css_selector_serialize_list` function processes each selector in the list, calling the `callback` for each. + +### Logging and Error Messages + +If there are any log messages, we serialize and print them. + +```c +if (lxb_css_log_length(lxb_css_parser_log(parser)) != 0) { + static const lxb_char_t indent[] = " "; + static const size_t indent_length = sizeof(indent) / sizeof(lxb_char_t) - 1; + + printf("Log:\n"); + (void) lxb_css_log_serialize(parser->log, callback, NULL, indent, indent_length); + printf("\n"); +} +``` + +Here, `lxb_css_log_serialize` formats any log messages using the provided indentation and then calls the callback for each log entry. + +### Cleanup Resources + +Finally, we must clean up allocated resources. + +```c +(void) lxb_css_parser_destroy(parser, true); +lxb_css_selector_list_destroy_memory(list); +``` + +`lxb_css_parser_destroy` and `lxb_css_selector_list_destroy_memory` ensure that all memory allocated for the parser and selector list is properly freed. + +## Notes + +- The example demonstrates robust error handling tied with each crucial step. +- Serialization is handled via callback functions, which offers flexibility for different output handling needs. +- Proper memory management is critical, underscored by the cleanup section. + +## Summary + +This example from `lexbor` showcases a complete cycle from parsing a complex CSS selector string to its serialization and error logging. For users looking to leverage the `lexbor` library, understanding this example is fundamental as it highlights key functionalities: parser creation, selector parsing, serialization, and resource management. By mastering these steps, developers can efficiently integrate CSS parsing capabilities into their applications. \ No newline at end of file diff --git a/source/examples/css/selectors/list_fast_way.md b/source/examples/css/selectors/list_fast_way.md new file mode 100644 index 0000000..d02e2e9 --- /dev/null +++ b/source/examples/css/selectors/list_fast_way.md @@ -0,0 +1,147 @@ +# Understanding Fast CSS Selector Parsing with `lexbor` + +This article provides a detailed explanation of the +[lexbor/css/selectors/list_fast_way.c](https://github.com/lexbor/lexbor/blob/master/examples/lexbor/css/selectors/list_fast_way.c) +example from the `lexbor` library. The example demonstrates how to efficiently +parse and process CSS selectors using `lexbor`. We will look into the key +sections, including initialization, parsing, and serialization of selectors, +highlighting the intent and logic behind these implementations. + +## Key Code Sections + +### Initialization of Memory and Parser + +The example starts with the initialization of memory and parser objects. Here are the +relevant portions of the code: + +```c +lxb_css_memory_t *memory; +lxb_css_parser_t *parser; + +/* Memory for all parsed structures. 
*/ +memory = lxb_css_memory_create(); +status = lxb_css_memory_init(memory, 128); +if (status != LXB_STATUS_OK) { + return EXIT_FAILURE; +} + +/* Create parser. */ +parser = lxb_css_parser_create(); +status = lxb_css_parser_init(parser, NULL); +if (status != LXB_STATUS_OK) { + return EXIT_FAILURE; +} +``` + +The `lxb_css_memory_create` and `lxb_css_memory_init` functions are used to create and +initialize a memory pool that will be used for storing parsed structures. The parser is +created with `lxb_css_parser_create` and initialized with `lxb_css_parser_init`. These +steps ensure that memory management is handled efficiently. + +### Memory Binding to Parser + +One crucial aspect is binding the memory pool to the parser, preventing redundant memory +allocations. The following lines achieve this: + +```c +/* Bind memory to parser */ +lxb_css_parser_memory_set(parser, memory); +``` + +By binding the memory object to the parser using `lxb_css_parser_memory_set`, the example +ensures that all parsed structures share the same memory pool, promoting efficiency and +preventing memory fragmentation. + +### Creating and Binding Selectors + +Selectors are created and bound to the parser, ensuring streamlined parsing operations: + +```c +lxb_css_selectors_t *selectors; +selectors = lxb_css_selectors_create(); +status = lxb_css_selectors_init(selectors); +if (status != LXB_STATUS_OK) { + return EXIT_FAILURE; +} +lxb_css_parser_selectors_set(parser, selectors); +``` + +The selectors object is created and initialized through `lxb_css_selectors_create` and +`lxb_css_selectors_init`. Binding the selectors to the parser with +`lxb_css_parser_selectors_set` prevents the creation of new selectors objects on each +parsing operation. + +### Parsing Selectors + +The central part of the example involves parsing the CSS selectors provided in an array: + +```c +const char *slctrs[] = { ":not()", "div #hash [refs=i]", "div.class", ... }; + +for (i = 0; slctrs[i] != NULL; i++) { + lists[i] = lxb_css_selectors_parse(parser, (const lxb_char_t *) slctrs[i], + strlen(slctrs[i])); + if (parser->status != LXB_STATUS_OK) { + /* Handle parse error */ + } else { + /* Handle parse success */ + } +} +``` + +The array `slctrs` contains various CSS selectors to parse. The `lxb_css_selectors_parse` +function is called for each selector, and its result is stored in the `lists` array. The +parser's status is checked to determine if the parsing was successful. + +### Log Serialization + +In case of errors or warnings during parsing, the logs are serialized and printed: + +```c +(void) lxb_css_log_serialize(parser->log, callback, NULL, indent, indent_length); +``` + +The `lxb_css_log_serialize` function serializes the log information, using a `callback` +to output the serialized data. This helps in diagnosing issues during the parsing process. + +### Cleanup Resources + +Once parsing is complete, the resources associated with the parser and selectors are +destroyed: + +```c +(void) lxb_css_selectors_destroy(selectors, true); +(void) lxb_css_parser_destroy(parser, true); +``` + +Destroying these resources ensures that any allocated memory is properly freed, preventing +memory leaks. + +### Outputting Results + +The parsed selector lists are then serialized and outputted: + +```c +for (i = 0; slctrs[i] != NULL; i++) { + if (lists[i] != NULL) { + (void) lxb_css_selector_serialize_list(lists[i], callback, NULL); + } +} +``` + +Each parsed selector list is serialized using `lxb_css_selector_serialize_list`, and the +results are printed. 
This demonstrates the outcomes of the parsing operations. + +## Notes + +- Binding memory and selectors to the parser improves efficiency and prevents + redundant memory allocations. +- Proper error handling and log serialization provide insights into parsing issues. +- Resource cleanup is essential to prevent memory leaks. + +## Summary + +This example illustrates efficient parsing of CSS selectors using the `lexbor` library by +binding memory and selectors to the parser, parsing various selectors, handling errors, +and serializing the results. Understanding these techniques is valuable for developers +looking to leverage `lexbor` for high-performance CSS parsing in their applications. \ No newline at end of file diff --git a/source/examples/css/syntax/simple_colorize.md b/source/examples/css/syntax/simple_colorize.md new file mode 100644 index 0000000..c68c93e --- /dev/null +++ b/source/examples/css/syntax/simple_colorize.md @@ -0,0 +1,199 @@ +# CSS Lexer with Colorized Output + +This example demonstrates how to use the `lexbor` library to parse a CSS file and provide colorized output based on the different types of CSS tokens encountered. The code is found in the [lexbor/css/syntax/simple_colorize.c](https://github.com/lexbor/lexbor/blob/master/examples/lexbor/css/syntax/simple_colorize.c) file. The primary objective of this example is to showcase how to set up a CSS parser, process different CSS rules, and colorize the output dynamically to reflect the structure of CSS syntax. + +## Key Code Sections + +### Parsing Initialization + +First, let's look at the initialization process for the CSS parser, file reading, and the initial call to the parsing function: + +```c +if (argc != 2) { + fprintf(stderr, "Usage:\n"); + fprintf(stderr, "\tcolorize \n"); + FAILED("Invalid number of arguments"); +} + +fl = (const lxb_char_t *) argv[1]; + +css = lexbor_fs_file_easy_read(fl, &css_len); +if (css == NULL) { + FAILED("Failed to read CSS file"); +} + +parser = lxb_css_parser_create(); +status = lxb_css_parser_init(parser, NULL); +if (status != LXB_STATUS_OK) { + FAILED("Failed to create CSS Parser"); +} + +status = css_parse(parser, css, css_len); +``` + +This part checks for a single command-line argument, reads the CSS file, initializes the `lexbor` CSS parser, and starts the parsing process using `css_parse`. + +### `css_parse` Function + +The main parsing logic is within the `css_parse` function: + +```c +static lxb_status_t +css_parse(lxb_css_parser_t *parser, const lxb_char_t *data, size_t length) +{ + css_ctx_t ctx; + lxb_css_syntax_rule_t *stack; + + ctx.data = data; + ctx.offset = 0; + + lxb_css_parser_buffer_set(parser, data, length); + + stack = lxb_css_syntax_parser_list_rules_push(parser, NULL, NULL, + &css_list_rules, + &ctx, true, + LXB_CSS_SYNTAX_TOKEN_UNDEF); + if (stack == NULL) { + return LXB_STATUS_ERROR; + } + + printf("\n"); + + return lxb_css_syntax_parser_run(parser); +} +``` + +This function sets up the parser buffer and pushes the initial parsing rules onto the stack using `lxb_css_syntax_parser_list_rules_push`. It then runs the parser by calling `lxb_css_syntax_parser_run`. 
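+
+The `ctx` value pushed with the rule stack is a small user-defined structure
+that is handed back to every callback; the code above only relies on the two
+fields it initializes. A minimal sketch of how such a context might be declared
+(the exact definition in the example source may differ):
+
+```c
+/* User context threaded through all parsing callbacks in this example. */
+typedef struct {
+    const lxb_char_t *data;   /* Start of the CSS source buffer. */
+    size_t           offset;  /* Offset bookkeeping used when printing tokens. */
+} css_ctx_t;
+```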
+ +### Callback Structures + +The following structures define callbacks for handling different CSS syntactic elements: + +```c +static const lxb_css_syntax_cb_at_rule_t css_at_rule = { + .state = css_at_rule_state, + .block = css_at_rule_block, + .failed = lxb_css_state_failed, + .end = css_at_rule_end +}; + +static const lxb_css_syntax_cb_qualified_rule_t css_qualified_rule = { + .state = css_qualified_rule_state, + .block = css_qualified_rule_block, + .failed = lxb_css_state_failed, + .end = css_qualified_rule_end +}; + +static const lxb_css_syntax_cb_list_rules_t css_list_rules = { + .cb.state = css_list_rules_state, + .cb.failed = lxb_css_state_failed, + .cb.end = css_list_rules_end, + .next = css_list_rules_next, + .at_rule = &css_at_rule, + .qualified_rule = &css_qualified_rule +}; +``` + +These structures define callbacks for handling different rules such as at-rules, qualified rules, and lists of rules. They point to specific functions that handle each part of the CSS token processing. + +### Handling Rules with Color + +Let's examine how specific rules are handled and colorized, starting with the at-rule state: + +```c +static bool +css_at_rule_state(lxb_css_parser_t *parser, + const lxb_css_syntax_token_t *token, void *ctx) +{ + css_print_token_offset(token, ctx); + + printf("\033[35m"); + css_print_token(token, ctx); + + lxb_css_syntax_parser_consume(parser); + token = lxb_css_syntax_parser_token(parser); + + printf("\033[33m"); + + css_consule_tokens(parser, token, ctx); + + printf("\033[39m"); + + return lxb_css_parser_success(parser); +} +``` + +Here, the at-rule state function sets the color (using ANSI escape codes) and prints the token while consuming and processing subsequent tokens within an at-rule block. + +### Token Serialization + +The function `css_consule_tokens` is used to serialize and print tokens: + +```c +lxb_inline void +css_consule_tokens(lxb_css_parser_t *parser, + const lxb_css_syntax_token_t *token, void *ctx) +{ + while (token != NULL && token->type != LXB_CSS_SYNTAX_TOKEN__END) { + (void) lxb_css_syntax_token_serialize(token, token_cb_f, ctx); + + lxb_css_syntax_parser_consume(parser); + token = lxb_css_syntax_parser_token(parser); + } +} +``` + +It continuously consumes and prints each token until the end of the input or a terminating token is reached. + +### Coloring Declaration Names and Values + +The following functions handle the coloring of CSS property names and values: + +```c +static bool +css_declarations_name(lxb_css_parser_t *parser, + const lxb_css_syntax_token_t *token, void *ctx) +{ + css_print_token_offset(token, ctx); + + printf("\033[31m"); + + css_consule_tokens(parser, token, ctx); + + printf("\033[39m"); + + return lxb_css_parser_success(parser); +} + +static bool +css_declarations_value(lxb_css_parser_t *parser, + const lxb_css_syntax_token_t *token, void *ctx) +{ + css_print_token_offset(token, ctx); + + printf("\033[36m"); + + while (token != NULL && token->type != LXB_CSS_SYNTAX_TOKEN__END) { + (void) lxb_css_syntax_token_serialize(token, token_cb_f, ctx); + + lxb_css_syntax_parser_consume(parser); + token = lxb_css_syntax_parser_token(parser); + } + + printf("\033[39m"); + + return lxb_css_parser_success(parser); +} +``` + +These functions respectively color CSS property names in red and their values in cyan. + +## Notes + +- **Color Codes**: The example uses ANSI escape codes (e.g., `\033[31m`) to color the output, which may not be supported on all terminals. 
+- **Memory Management**: It is critical to properly destroy and free the parser and allocated memory to prevent leaks. +- **Error Handling**: The example includes fundamental error handling mechanisms but may require enhancements for robustness in production systems. + +## Summary + +This example illustrates how to use the `lexbor` library effectively for parsing and colorizing CSS. The key takeaways include setting up the parser, defining callback structures to handle different CSS rules, and utilizing token serialization and ANSI escape codes for colored output. Understanding these principles helps leverage the `lexbor` library for more complex CSS parsing and processing tasks. \ No newline at end of file diff --git a/source/examples/css/syntax/structure_parse_file.md b/source/examples/css/syntax/structure_parse_file.md new file mode 100644 index 0000000..e8f580e --- /dev/null +++ b/source/examples/css/syntax/structure_parse_file.md @@ -0,0 +1,169 @@ +# Parsing CSS Syntax from File + +This example demonstrates how to parse a CSS file and interpret its syntax using the `lexbor` library. The provided C code, located in [lexbor/css/syntax/structure_parse_file.c](https://github.com/lexbor/lexbor/blob/master/examples/lexbor/css/syntax/structure_parse_file.c), reads a CSS file, parses its content, and handles different CSS rules and declarations. The primary aim of this example is to show the steps involved in setting up a `lexbor` CSS parser, defining necessary callbacks, and executing the parsing process. This detailed explanation walks through the key functionality and sophisticated use of the `lexbor` library functions and data types. + +## Key Code Sections + +### Initialization and Main Function + +At the heart of the program is the `main()` function, which initializes the CSS parser and reads the CSS input file. + +```c +int +main(int argc, const char *argv[]) +{ + size_t css_len; + lxb_char_t *css; + lxb_status_t status; + lxb_css_parser_t *parser; + const lxb_char_t *fl; + + if (argc != 2) { + fprintf(stderr, "Usage:\n"); + fprintf(stderr, "\tstructure_parse_file \n"); + FAILED("Invalid number of arguments"); + } + + fl = (const lxb_char_t *) argv[1]; + + css = lexbor_fs_file_easy_read(fl, &css_len); + if (css == NULL) { + FAILED("Failed to read CSS file"); + } + + parser = lxb_css_parser_create(); + status = lxb_css_parser_init(parser, NULL); + if (status != LXB_STATUS_OK) { + FAILED("Failed to create CSS Parser"); + } + + status = css_parse(parser, css, css_len); + + (void) lexbor_free(css); + (void) lxb_css_parser_destroy(parser, true); + + if (status != LXB_STATUS_OK) { + FAILED("Failed to parse CSS"); + } + + return EXIT_SUCCESS; +} +``` + +In this segment, the program reads a CSS file, initializes the CSS parser, and invokes the `css_parse` function to start parsing. + +### Parsing Function + +The `css_parse` function sets the buffer and pushes the initial rule stack to begin parsing. 
+ +```c +static lxb_status_t +css_parse(lxb_css_parser_t *parser, const lxb_char_t *data, size_t length) +{ + lxb_css_syntax_rule_t *stack; + + lxb_css_parser_buffer_set(parser, data, length); + + stack = lxb_css_syntax_parser_list_rules_push(parser, NULL, NULL, + &css_list_rules, + NULL, true, + LXB_CSS_SYNTAX_TOKEN_UNDEF); + if (stack == NULL) { + return LXB_STATUS_ERROR; + } + + return lxb_css_syntax_parser_run(parser); +} +``` + +Here, `lxb_css_parser_buffer_set` assigns the data to the parser, and `lxb_css_syntax_parser_list_rules_push` initializes the entry point for parsing, specifying callbacks for handling list rules. + +### Callback: Handling List Rules + +Callbacks manage the state transitions and actions for different parts of the CSS syntax. For example, the `css_list_rules_state` is invoked when starting to process a list of rules. + +```c +static bool +css_list_rules_state(lxb_css_parser_t *parser, + const lxb_css_syntax_token_t *token, void *ctx) +{ + PRINT("Begin List Of Rules"); + + return lxb_css_parser_success(parser); +} + +static bool +css_list_rules_next(lxb_css_parser_t *parser, + const lxb_css_syntax_token_t *token, void *ctx) +{ + PRINT("Next List Of Rules"); + + return lxb_css_parser_success(parser); +} +``` + +These callbacks print messages indicating the start and continuation of rule listings in the CSS file and signify successful parsing. + +### Callback: Handling At-Rules + +At-rules (`@` rules) such as `@media` or `@keyframes` have dedicated callbacks. + +```c +static bool +css_at_rule_state(lxb_css_parser_t *parser, + const lxb_css_syntax_token_t *token, void *ctx) +{ + PRINT("Begin At-Rule Prelude"); + + css_consule_tokens(parser, token, ctx); + + printf("\n\n"); + + return lxb_css_parser_success(parser); +} + +static bool +css_at_rule_block(lxb_css_parser_t *parser, + const lxb_css_syntax_token_t *token, void *ctx) +{ + PRINT("Begin At-Rule Block"); + + css_consule_tokens(parser, token, ctx); + + printf("\n\n"); + + return lxb_css_parser_success(parser); +} +``` + +These functions print messages and consume tokens associated with at-rule prelude and block contexts. + +### Consuming Tokens + +The `css_consule_tokens` function processes tokens used across many callbacks to parse the token stream effectively. + +```c +lxb_inline void +css_consule_tokens(lxb_css_parser_t *parser, + const lxb_css_syntax_token_t *token, void *ctx) +{ + while (token != NULL && token->type != LXB_CSS_SYNTAX_TOKEN__END) { + (void) lxb_css_syntax_token_serialize(token, token_cb_f, ctx); + + lxb_css_syntax_parser_consume(parser); + token = lxb_css_syntax_parser_token(parser); + } +} +``` + +This loop continues consuming tokens until the end of token stream, serializing and printing each token. + +## Notes + +- **Initialization**: Correct initialization and cleanup of the parser are essential for avoiding memory leaks. +- **Callback Mechanism**: The versatile use of callbacks for various states (e.g., at-rules, declarations) makes it easy to extend the parser functionality. +- **Token Handling**: Efficient handling and processing of tokens ensure correct CSS parsing and interpretation. + +## Summary + +The example code in [lexbor/css/syntax/structure_parse_file.c](https://github.com/lexbor/lexbor/blob/master/examples/lexbor/css/syntax/structure_parse_file.c) serves as an excellent illustration of parsing CSS files using the `lexbor` library. 
By walking through the setup, parsing mechanics, and token handling, one can gain a solid understanding of how to leverage `lexbor` for CSS parsing tasks. This example lays the foundation for more advanced CSS manipulation and analysis using `lexbor`. \ No newline at end of file diff --git a/source/examples/css/syntax/tokenizer/chunks_stdin.md b/source/examples/css/syntax/tokenizer/chunks_stdin.md new file mode 100644 index 0000000..95a2405 --- /dev/null +++ b/source/examples/css/syntax/tokenizer/chunks_stdin.md @@ -0,0 +1,119 @@ +# Tokenizing CSS from Standard Input + +The file [lexbor/css/syntax/tokenizer/chunks_stdin.c](https://github.com/lexbor/lexbor/blob/master/examples/lexbor/css/syntax/tokenizer/chunks_stdin.c) demonstrates how to tokenize CSS data read from standard input using the `lexbor` library. This article will delve into the key parts of this example, explaining the purpose and workings of each section. + +## Key Code Sections + +### Callback for Token Serialization + +The function `callback` is used to handle the serialized tokens. It simply prints the token data to the standard output. + +```c +lxb_status_t +callback(const lxb_char_t *data, size_t len, void *ctx) +{ + printf("%s", (const char *) data); + + return LXB_STATUS_OK; +} +``` + +This demonstrates a basic usage of `lxb_css_syntax_token_serialize`, indicating how tokens will be rendered and processed. + +### Handling Input in Chunks + +The function `chunk_cb` reads data from standard input into a buffer, allowing the tokenizer to process it in chunks. This is particularly useful for handling large inputs gracefully. + +```c +lxb_status_t +chunk_cb(lxb_css_syntax_tokenizer_t *tkz, const lxb_char_t **data, + const lxb_char_t **end, void *ctx) +{ + size_t size; + lxb_char_t *buff = ctx; + + size = fread((char *) buff, 1, BUFFER_SIZE, stdin); + if (size != BUFFER_SIZE) { + if (feof(stdin)) { + tkz->eof = true; + } + else { + return EXIT_FAILURE; + } + } + + *data = buff; + *end = buff + size; + + return LXB_STATUS_OK; +} +``` + +This function fills a buffer with a fixed size (`BUFFER_SIZE`) from `stdin`, managing the end-of-file condition by setting `tkz->eof` when necessary. This function returns `LXB_STATUS_OK` if reading proceeds without errors. + +### Tokenizing the Input + +The `main` function initializes the CSS tokenizer, sets the chunk callback, and processes tokens in a loop until the end-of-file token is encountered. 
+ +```c +int +main(int argc, const char *argv[]) +{ + lxb_status_t status; + lxb_css_syntax_token_t *token; + lxb_css_syntax_tokenizer_t *tkz; + lxb_css_syntax_token_type_t type; + const lxb_char_t *name; + char inbuf[BUFFER_SIZE]; + + tkz = lxb_css_syntax_tokenizer_create(); + status = lxb_css_syntax_tokenizer_init(tkz); + if (status != LXB_STATUS_OK) { + PRINT("Failed to create CSS:Syntax parser"); + goto failed; + } + + lxb_css_syntax_tokenizer_chunk_cb_set(tkz, chunk_cb, inbuf); + + do { + token = lxb_css_syntax_token(tkz); + if (token == NULL) { + PRINT("Failed to parse CSS"); + goto failed; + } + + name = lxb_css_syntax_token_type_name_by_id(token->type); + printf("%s: ", (const char *) name); + + lxb_css_syntax_token_serialize(token, callback, NULL); + printf("\n"); + + type = lxb_css_syntax_token_type(token); + + lxb_css_syntax_token_consume(tkz); + } + while (type != LXB_CSS_SYNTAX_TOKEN__EOF); + + lxb_css_syntax_tokenizer_destroy(tkz); + + return EXIT_SUCCESS; + +failed: + + lxb_css_syntax_tokenizer_destroy(tkz); + + return EXIT_FAILURE; +} +``` + +Here, the tokenizer is created and initialized, and the chunk callback is set with a buffer for data. The loop continues to fetch tokens, prints their names and serialized content, and consumes each token until the end-of-file token is reached. + +## Notes + +- **Buffer Size**: The buffer size (`BUFFER_SIZE`) is set to 32 to demonstrate handling small chunks of data. This size can be adjusted based on specific needs. +- **Error Handling**: The example includes basic error handling, with appropriate messages and clean-up. +- **EOF Management**: The end-of-file is managed using `tkz->eof`, ensuring the tokenizer knows when no more data is available. + +## Summary + +This example illustrates how to tokenize CSS data read from standard input, demonstrating key aspects of using the `lexbor` library. It covers initialization, setting up a chunk callback function, handling tokens, and managing end-of-file conditions. Understanding these steps is crucial for effectively working with `lexbor` to tokenize CSS or other similar structured data inputs. \ No newline at end of file diff --git a/source/examples/css/syntax/tokenizer/from_file.md b/source/examples/css/syntax/tokenizer/from_file.md new file mode 100644 index 0000000..c070289 --- /dev/null +++ b/source/examples/css/syntax/tokenizer/from_file.md @@ -0,0 +1,99 @@ +# Parsing a CSS File with `lexbor` + +This article focuses on the source file [lexbor/css/syntax/tokenizer/from_file.c](https://github.com/lexbor/lexbor/blob/master/examples/lexbor/css/syntax/tokenizer/from_file.c) and explains how to parse a CSS file using the `lexbor` library. This explanation delves into the specific functions and methods employed to tokenize and handle CSS content, illustrating a practical approach to CSS parsing with `lexbor`. + +The example code demonstrates how to read a CSS file, tokenize its content using `lexbor`'s CSS syntax tokenizer, and print each recognized token. Understanding this example provides insight into the fundamental use of `lexbor` for processing CSS files, which is critical for many web development and parsing applications that require robust CSS manipulation. + +## Key Code Sections + +### Reading the CSS File + +The initial step in the code involves reading a CSS file. 
This is accomplished using `lexbor`'s file reading utility: + +```c +css = lexbor_fs_file_easy_read((const lxb_char_t *) argv[1], &css_len); +if (css == NULL) { + FAILED("Failed to read CSS file"); +} +``` + +Here, `lexbor_fs_file_easy_read` takes the file path provided via command line arguments and reads its content into a dynamically allocated buffer. The length of the CSS content is stored in `css_len`. If the file reading fails, the program exits with an error. + +### Initializing the Tokenizer + +Once the CSS content is loaded, the next step is to initialize the tokenizer: + +```c +tkz = lxb_css_syntax_tokenizer_create(); +status = lxb_css_syntax_tokenizer_init(tkz); +if (status != LXB_STATUS_OK) { + PRINT("Failed to create CSS:Syntax parser"); + goto failed; +} +``` + +The tokenizer is created with `lxb_css_syntax_tokenizer_create` and initialized with `lxb_css_syntax_tokenizer_init`. If the initialization fails, the code jumps to the `failed` label to clean up resources and exit. + +### Setting the Buffer and Tokenizing + +After initializing the tokenizer, the CSS content is set as the buffer for the tokenizer: + +```c +lxb_css_syntax_tokenizer_buffer_set(tkz, css, css_len); +``` + +This function sets the internal buffer of the tokenizer to the CSS data, preparing it for tokenization. + +### Processing Tokens + +The core of the tokenization process involves a loop that retrieves and processes each token: + +```c +do { + token = lxb_css_syntax_token(tkz); + if (token == NULL) { + PRINT("Failed to parse CSS"); + goto failed; + } + + name = lxb_css_syntax_token_type_name_by_id(token->type); + printf("%s: ", (const char *) name); + + lxb_css_syntax_token_serialize(token, callback, NULL); + printf("\n"); + + type = lxb_css_syntax_token_type(token); + + lxb_css_syntax_token_consume(tkz); +} +while (type != LXB_CSS_SYNTAX_TOKEN__EOF); +``` + +In this loop: +- `lxb_css_syntax_token` retrieves the next token from the tokenizer. +- `lxb_css_syntax_token_type_name_by_id` gets the token's type name. +- `lxb_css_syntax_token_serialize` outputs the token's content using a callback function. +- `lxb_css_syntax_token_consume` advances the tokenizer to the next token. + +The loop continues until the end-of-file (EOF) token is encountered. + +### Cleaning Up + +Finally, once all tokens are processed, resources are cleaned up: + +```c +lxb_css_syntax_tokenizer_destroy(tkz); +lexbor_free(css); +``` + +This ensures that allocated memory is properly freed. + +## Notes + +- `lexbor_fs_file_easy_read` simplifies file reading but requires proper error handling. +- Proper initialization and cleanup of the tokenizer are crucial to avoid memory leaks. +- The tokenization loop processes each token and prints its type and content. + +## Summary + +This example illustrates how to use `lexbor` to read and tokenize CSS files. It covers essential functions for file reading, tokenizer initialization, and token processing. Understanding these steps is fundamental for developers looking to integrate CSS parsing capabilities into their applications using `lexbor`. 
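+
+One piece the walkthrough does not show is the `callback` handed to
+`lxb_css_syntax_token_serialize`. As in the other tokenizer examples in this
+section, it only needs to forward each serialized chunk to the output; a
+minimal version looks like this:
+
+```c
+lxb_status_t
+callback(const lxb_char_t *data, size_t len, void *ctx)
+{
+    /* Print each serialized chunk of the token as-is. */
+    printf("%.*s", (int) len, (const char *) data);
+    return LXB_STATUS_OK;
+}
+```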
\ No newline at end of file diff --git a/source/examples/css/syntax/tokenizer/print_raw.md b/source/examples/css/syntax/tokenizer/print_raw.md new file mode 100644 index 0000000..0b3be93 --- /dev/null +++ b/source/examples/css/syntax/tokenizer/print_raw.md @@ -0,0 +1,137 @@ +# CSS Tokenizer Printing + +This article explains the source code example found in [lexbor/css/syntax/tokenizer/print_raw.c](https://github.com/lexbor/lexbor/blob/master/examples/lexbor/css/syntax/tokenizer/print_raw.c). This example demonstrates how to utilize the `lexbor` library to parse a CSS file and print the raw tokens produced by the tokenizer. We'll delve into the key code sections to better understand the parsing process and token management with `lexbor`. + +## Key Code Sections + +### Usage Function + +The `usage` function provides a simple command-line usage description. It's designed to inform the user about the proper way to run the program. + +```c +static void +usage(void) +{ + fprintf(stderr, "print_raw \n"); +} +``` + +This function prints the correct command-line format to `stderr`. It's invoked when the user provides incorrect arguments. + +### Colorize Callback + +The `colorize_cb` function prints tokens to the standard output. It differentiates special cases, such as dimension tokens, and handles them appropriately. + +```c +void +colorize_cb(lxb_css_syntax_token_t *token) +{ + int length; + lxb_css_syntax_token_base_t *base; + lxb_css_syntax_token_string_t *str; + + base = lxb_css_syntax_token_base(token); + length = (int) base->length; + + printf("%.*s", length, base->begin); + + if (token->type == LXB_CSS_SYNTAX_TOKEN_DIMENSION) { + str = lxb_css_syntax_token_dimension_string(token); + + /* Ident */ + length = (int) str->base.length; + + printf("%.*s", length, str->base.begin); + } +} +``` + +This function extracts the base token details and prints them. If the token is of type `LXB_CSS_SYNTAX_TOKEN_DIMENSION`, it also prints the dimension string. + +### Main Function + +The `main` function orchestrates the overall process of reading the file, initializing the tokenizer, and processing CSS tokens. + +```c +int +main(int argc, const char *argv[]) +{ + lxb_status_t status; + lxb_css_syntax_token_t *token; + lxb_css_syntax_tokenizer_t *tkz; + lxb_css_syntax_token_type_t type; + lxb_char_t *css; + size_t css_len; + + if (argc != 2) { + usage(); + FAILED("Invalid number of arguments"); + } + + css = lexbor_fs_file_easy_read((const lxb_char_t *) argv[1], &css_len); + if (css == NULL) { + FAILED("Failed to read CSS file"); + } + + tkz = lxb_css_syntax_tokenizer_create(); + status = lxb_css_syntax_tokenizer_init(tkz); + if (status != LXB_STATUS_OK) { + PRINT("Failed to create CSS:Syntax parser"); + goto failed; + } +``` + +This block checks the command-line arguments and reads the content of the specified CSS file. If successful, it creates and initializes the CSS tokenizer. + +```c +tkz->with_comment = true; +lxb_css_syntax_tokenizer_buffer_set(tkz, css, css_len); + +do { + token = lxb_css_syntax_token(tkz); + if (token == NULL) { + PRINT("Failed to parse CSS"); + goto failed; + } + + colorize_cb(token); + + type = lxb_css_syntax_token_type(token); + + lxb_css_syntax_token_consume(tkz); +} while (type != LXB_CSS_SYNTAX_TOKEN__EOF); + +lxb_css_syntax_tokenizer_destroy(tkz); +lexbor_free(css); + +printf("\n"); + +return EXIT_SUCCESS; +``` + +The tokenizing loop handles each token produced by the tokenizer. Each token is processed by the `colorize_cb` function and then consumed. 
The loop continues until an EOF token is encountered. + +### Clean-Up and Error Handling + +If any step in the process fails, the resources are properly released, and an error code is returned. + +```c +failed: + +lxb_css_syntax_tokenizer_destroy(tkz); +lexbor_free(css); + +return EXIT_FAILURE; +``` + +This block ensures that the tokenizer and memory allocated for the CSS content are freed even if an error occurs. + +## Notes + +1. **Token Consumption**: The `lxb_css_syntax_token_consume` function advances the tokenizer to the next token. +2. **Dimension Tokens**: The example specially handles `LXB_CSS_SYNTAX_TOKEN_DIMENSION`, indicating the handling of composite tokens. +3. **Error Handling**: Proper clean-up routines ensure that resources are freed in both success and failure cases. + +## Summary + +This example effectively demonstrates how to use `lexbor` to tokenize and print CSS tokens. It highlights crucial aspects such as correct tokenizer initialization, token handling, and the importance of resource management. Understanding this pattern is essential for developers dealing with CSS parsing or similar tasks using the `lexbor` library. \ No newline at end of file diff --git a/source/examples/encoding/buffer/decode/decode.md b/source/examples/encoding/buffer/decode/decode.md new file mode 100644 index 0000000..d00d6c8 --- /dev/null +++ b/source/examples/encoding/buffer/decode/decode.md @@ -0,0 +1,64 @@ +# Decoding UTF-8 Strings to Code Points with `lexbor` +The code example in [lexbor/encoding/buffer/decode/decode.c](https://github.com/lexbor/lexbor/blob/master/examples/lexbor/encoding/buffer/decode/decode.c) demonstrates how to decode a UTF-8 encoded string into individual Unicode code points using the `lexbor` library. The example illustrates initialization, decoding, and extracting code points using various `lexbor` functions and data types. + +## Key Code Sections + +### Initialization and Buffer Preparation + +```c +const lxb_char_t *data = (const lxb_char_t *) "Привет, мир!"; +const lxb_char_t *end = data + strlen((char *) data); +``` + +Here, a UTF-8 encoded string `"Привет, мир!"` is defined and its length is calculated. These will be utilized later during the decoding process. + +### Initializing the Decoder + +```c +const lxb_encoding_data_t *encoding; +lxb_status_t status; +lxb_codepoint_t cp[32]; +lxb_encoding_decode_t decode; + +encoding = lxb_encoding_data(LXB_ENCODING_UTF_8); + +status = lxb_encoding_decode_init(&decode, encoding, cp, + sizeof(cp) / sizeof(lxb_codepoint_t)); +if (status != LXB_STATUS_OK) { + FAILED("Failed to initialization decoder"); +} +``` + +In this section, the UTF-8 encoding data structure is obtained with `lxb_encoding_data(LXB_ENCODING_UTF_8)`. The decoder is then initialized via `lxb_encoding_decode_init`, where `decode` is the decoder context, `encoding` provides encoding information, and `cp` is an array to store the decoded code points. The size of this array is specified in terms of the number of `lxb_codepoint_t` elements it can hold. + +### Performing the Decoding + +```c +status = encoding->decode(&decode, &data, end); +if (status != LXB_STATUS_OK) { + // In this example, this cannot happen. +} +``` + +The actual decoding process occurs with `encoding->decode(&decode, &data, end)`, taking the initialized decoder and the data buffer into account. The `data` pointer is updated during the procedure and moves towards `end`. It’s worth noting that usual error handling is omitted here, under the assumption that decoding will succeed. 
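+
+A single call is enough here because the 12 code points of `"Привет, мир!"` fit
+comfortably into the 32-element `cp` buffer. For longer input, `decode` returns
+`LXB_STATUS_SMALL_BUFFER` whenever the buffer fills up, and the call belongs in
+a loop that drains the buffer between iterations, mirroring the pattern used in
+the stream decoder example. A sketch under that assumption:
+
+```c
+/* Sketch: decoding input that may not fit the code-point buffer in one call. */
+do {
+    status = encoding->decode(&decode, &data, end);
+
+    size_t used = lxb_encoding_decode_buf_used(&decode);
+
+    for (size_t i = 0; i < used; i++) {
+        printf("0x%04X\n", cp[i]);   /* Process the decoded code points. */
+    }
+
+    /* Mark the buffer as empty so the next call can refill it. */
+    lxb_encoding_decode_buf_used_set(&decode, 0);
+}
+while (status == LXB_STATUS_SMALL_BUFFER);
+```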
+ +### Printing the Decoded Code Points + +```c +size_t buf_length = lxb_encoding_decode_buf_used(&decode); + +for (size_t i = 0; i < buf_length; i++) { + printf("0x%04X\n", cp[i]); +} +``` + +Finally, the number of used buffer entries (`buf_length`) is obtained using `lxb_encoding_decode_buf_used(&decode)`. A loop then iterates through the decoded code points within `cp[]`, printing each as a hexadecimal value (`0x%04X`), conforms to the Unicode code points of the original UTF-8 string. + +## Notes + +- **Error Handling**: The macro `FAILED(...)` is used for error handling, terminating the program with a corresponding message and `EXIT_FAILURE`. This ensures immediate notification of initialization failures. +- **Buffer Management**: The `cp[]` array size is set to 32, meant for handling individual code points and providing enough space for decoding without buffer overflow. +- **Assumptions**: The example assumes a successful decoding process, omitting error handling for the decoding step itself. + +## Summary +This example illustrates a foundational aspect of working with `lexbor`: converting a UTF-8 encoded string to Unicode code points. By understanding how to initialize the decoder, handle buffer management, and perform the decoding process, developers can leverage `lexbor` for advanced text processing tasks. This underscores `lexbor`’s utility in dealing with various encodings efficiently and robustly. \ No newline at end of file diff --git a/source/examples/encoding/buffer/decode/decoder.md b/source/examples/encoding/buffer/decode/decoder.md new file mode 100644 index 0000000..0b639a3 --- /dev/null +++ b/source/examples/encoding/buffer/decode/decoder.md @@ -0,0 +1,121 @@ +# Lexbor Encoding Decoder + +This article delves into the purpose and functionality of the code from the file [lexbor/encoding/buffer/decode/decoder.c](https://github.com/lexbor/lexbor/blob/master/examples/lexbor/encoding/buffer/decode/decoder.c). The example demonstrates how to utilize the `lexbor` library to decode text from various encodings, converting it to Unicode code points. We'll explore key sections of the code to understand how it achieves this. + +## Key Code Sections + +### Initialization and Argument Handling + +The code begins by checking command-line arguments to ensure an encoding name is provided and initializing necessary components. + +```c +if (argc != 2) { + usage(); + exit(EXIT_SUCCESS); +} + +/* Determine encoding from first argument from command line */ +encoding = lxb_encoding_data_by_pre_name((const lxb_char_t *) argv[1], strlen(argv[1])); +if (encoding == NULL) { + FAILED(true, "Failed to get encoding from name: %s\n\n", argv[1]); +} +``` + +Here, `argc` is checked to guarantee exactly one argument is provided. The `usage()` function outputs how to use the program if the condition isn't met. The `lxb_encoding_data_by_pre_name()` function fetches encoding data based on the provided encoding name. If the encoding cannot be found, `FAILED()` is called to print an error and exit. 
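+
+Assuming the compiled example is named `decoder` (the actual binary name
+depends on how the examples are built), it takes an encoding label as its only
+argument and reads the bytes to decode from standard input:
+
+```sh
+printf 'Привет' | ./decoder UTF-8
+# Expected output: \u041F\u0440\u0438\u0432\u0435\u0442
+```
+
+Running it without arguments prints the `usage()` help, including the list of
+supported encoding labels.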
+ +### Decoder Initialization + +Next, the decoder is initialized with the specified encoding and a buffer for storing code points: + +```c +status = lxb_encoding_decode_init(&decode, encoding, cp, sizeof(cp) / sizeof(lxb_codepoint_t)); +if (status != LXB_STATUS_OK) { + FAILED(false, "Failed to initialization decoder"); +} + +status = lxb_encoding_decode_replace_set(&decode, LXB_ENCODING_REPLACEMENT_BUFFER, LXB_ENCODING_REPLACEMENT_BUFFER_LEN); +if (status != LXB_STATUS_OK) { + FAILED(false, "Failed to set replacement code points for decoder"); +} +``` + +The `lxb_encoding_decode_init()` function initializes the decoder, and `lxb_encoding_decode_replace_set()` sets replacement code points to handle invalid sequences during decoding. Both functions return a status code that must be checked to prevent further errors. + +### Reading and Decoding Input + +The core of the example is a loop that reads from `stdin` and decodes the data: + +```c +do { + /* Read standard input */ + read_size = fread(inbuf, 1, sizeof(inbuf), stdin); + if (read_size != sizeof(inbuf)) { + if (feof(stdin)) { + loop = false; + } else { + FAILED(false, "Failed to read stdin"); + } + } + + /* Decode incoming data */ + data = (const lxb_char_t *) inbuf; + end = data + read_size; + + do { + status = encoding->decode(&decode, &data, end); + + buf_length = lxb_encoding_decode_buf_used(&decode); + + for (size_t i = 0; i < buf_length; i++) { + if (cp[i] >= 0x00A0) { + /* Code point is Unicode */ + printf("\\u%04X", cp[i]); + } else { + /* Code point is ASCII */ + printf("\\x%02X", cp[i]); + } + } + + lxb_encoding_decode_buf_used_set(&decode, 0); + } while (status == LXB_STATUS_SMALL_BUFFER); +} while (loop); +``` + +This section reads input into `inbuf` and updates the decoder with `encoding->decode()`. It processes the buffer in chunks, printing converted code points as either Unicode or ASCII, depending on their values. The `lxb_encoding_decode_buf_used()` function returns the number of decoded code points, and this information is used to print the decoded values. + +### Finishing the Decoding Process + +Finally, after all input has been processed, the decoder flushes any remaining code points: + +```c +(void) lxb_encoding_decode_finish(&decode); + +/* + * We need to check the out buffer after calling the finish function. + * If there was not enough data to form a code point, then the finish + * function will add the replacement character to the out buffer. + */ +buf_length = lxb_encoding_decode_buf_used(&decode); + +if (buf_length != 0) { + for (size_t i = 0; i < buf_length; i++) { + if (cp[i] >= 0x00A0) { + printf("\\u%04X", cp[i]); + } else { + printf("\\x%02X", cp[i]); + } + } +} +``` + +The `lxb_encoding_decode_finish()` function ensures all data is processed, adding replacement characters if necessary. The remaining code points are then printed similarly to the earlier steps. + +## Notes + +- **Error Handling**: The use of the `FAILED()` macro ensures graceful termination upon encountering errors. +- **Encoding Support**: The `usage()` function lists the supported encodings that the program can handle. +- **Buffer Management**: Adequate handling of input and decoding buffers is critical for managing memory and ensuring correct decoding. + +## Summary + +This example demonstrates how to use the `lexbor` library for decoding text from various encodings to Unicode code points. Key aspects include initializing the decoder, reading input in manageable chunks, handling errors gracefully, and ensuring all data is processed. 
Understanding this example is valuable for leveraging `lexbor` in applications requiring robust text encoding handling. \ No newline at end of file diff --git a/source/examples/encoding/buffer/decode/validate.md b/source/examples/encoding/buffer/decode/validate.md new file mode 100644 index 0000000..db1b99a --- /dev/null +++ b/source/examples/encoding/buffer/decode/validate.md @@ -0,0 +1,111 @@ +# Validating and Replacing Invalid UTF-8 Encodings + +This article explains the example file [lexbor/encoding/buffer/decode/validate.c](https://github.com/lexbor/lexbor/blob/master/examples/lexbor/encoding/buffer/decode/validate.c) +which demonstrates how to decode a UTF-8 encoded string and handle invalid byte +sequences by replacing them with a specific replacement sequence using the `lexbor` library. + +The purpose of the example is to +show how to initialize a decoder, set replacement sequences for invalid byte +sequences, and decode a UTF-8 string, handling errors gracefully. This example +is useful to those needing to ensure robust UTF-8 decoding in their applications. + +## Key Code Sections + +### Initialization of Encoding Data + +In the first significant part of the code, we initialize the `lexbor` encoding +data for UTF-8: + +```c +const lxb_encoding_data_t *encoding; + +/* Initialize for UTF-8 encoding */ +encoding = lxb_encoding_data(LXB_ENCODING_UTF_8); +``` + +This uses the `lxb_encoding_data` function to obtain a pointer to the encoding +data for UTF-8, as specified by the constant `LXB_ENCODING_UTF_8`. + +### Decoder Initialization + +We then proceed with initializing the decoder by using `lxb_encoding_decode_init`: + +```c +lxb_status_t status; +lxb_codepoint_t cp[32]; +lxb_encoding_decode_t decode; + +status = lxb_encoding_decode_init(&decode, encoding, cp, + sizeof(cp) / sizeof(lxb_codepoint_t)); +if (status != LXB_STATUS_OK) { + FAILED("Failed to initialization decoder"); +} +``` + +Here, `lxb_encoding_decode_init` initializes the `decode` structure for the given +encoding and prepares it to store code points in the `cp` buffer. If this operation +fails, an error message is printed and the program exits. + +### Setting Replacement Code Points + +Invalid byte sequences are handled by setting a replacement sequence with +`lxb_encoding_decode_replace_set`: + +```c +status = lxb_encoding_decode_replace_set(&decode, LXB_ENCODING_REPLACEMENT_BUFFER, + LXB_ENCODING_REPLACEMENT_BUFFER_LEN); +if (status != LXB_STATUS_OK) { + FAILED("Failed to set replacement code points for decoder"); +} +``` + +By using the `LXB_ENCODING_REPLACEMENT_BUFFER` and associated length macro, +we configure the decoder to substitute invalid sequences with a predefined replacement. + +### Decoding the UTF-8 String + +The core decoding process is performed with: + +```c +const lxb_char_t *data = (const lxb_char_t *) "Привет,\x80 мир!"; +const lxb_char_t *end = data + strlen((char *) data); + +status = encoding->decode(&decode, &data, end); +if (status != LXB_STATUS_OK) { + /* In this example, this cannot happen. */ +} +``` + +Here, `data` contains the UTF-8 string to be decoded, including an invalid byte +sequence (`\x80`). We call the `encoding->decode` function to process the string +and handle any invalid sequences using the previously set replacement. 
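Because a replacement sequence was configured, the invalid `\x80` byte does not abort decoding; it simply appears in `cp[]` as a replacement code point. A small hedged check is shown below; the `0xFFFD` literal is an assumption about what `LXB_ENCODING_REPLACEMENT_BUFFER` expands to:

```c
/* Sketch: count how many code points were substituted during decoding. */
size_t replaced = 0;
size_t used = lxb_encoding_decode_buf_used(&decode);

for (size_t i = 0; i < used; i++) {
    if (cp[i] == 0xFFFD) {   /* U+FFFD REPLACEMENT CHARACTER (assumed) */
        replaced++;
    }
}

printf("Replaced code points: %zu\n", replaced);
```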
+ +### Printing the Result + +Finally, the decoded code points are printed: + +```c +size_t buf_length = lxb_encoding_decode_buf_used(&decode); + +for (size_t i = 0; i < buf_length; i++) { + printf("0x%04X\n", cp[i]); +} +``` + +The `lxb_encoding_decode_buf_used` function returns the number of code points +stored in the buffer, which we then iterate over, printing each as a hexadecimal value. + +## Notes + +1. The `FAILED` macro is used for error handling by printing a message and exiting. +2. The invalid byte sequence, `\x80`, is replaced using the specified replacement sequence. +3. The example demonstrates how to handle both initialization and runtime errors + gracefully. + +## Summary + +This example showcases the proper use of the `lexbor` library for decoding UTF-8 +strings while managing invalid byte sequences. It covers data initialization, +decoder setup, and configurable error handling using replacement sequences. +Understanding this example is essential for developers needing robust UTF-8 +decoding in their lexbor-based applications. \ No newline at end of file diff --git a/source/examples/encoding/buffer/encode/encode.md b/source/examples/encoding/buffer/encode/encode.md new file mode 100644 index 0000000..64d4fdc --- /dev/null +++ b/source/examples/encoding/buffer/encode/encode.md @@ -0,0 +1,93 @@ +# Encoding Unicode Code Points to UTF-8 + +In this article, we will analyze the code example found in [lexbor/encoding/buffer/encode/encode.c](https://github.com/lexbor/lexbor/blob/master/examples/lexbor/encoding/buffer/encode/encode.c). This example demonstrates how to use the `lexbor` library to encode Unicode code points into a UTF-8 byte string. We will delve into the details of how the buffer is managed and how the `lexbor` encoding functions are utilized to achieve the desired result. + +## Key Code Sections + +### Setup and Initialization + +The initial part of the code sets up the environment, prepares the buffer, and initializes the encoder. Let's take a closer look: + +```c +lxb_status_t status; +lxb_encoding_encode_t encode; +const lxb_codepoint_t *cps_ref, *cps_end; +const lxb_encoding_data_t *encoding; + +/* Prepare buffer */ +lxb_char_t buffer[1024]; + +/* Unicode code points for encoding */ +lxb_codepoint_t cps[] = {0x041F, 0x0440, 0x0438, 0x0432, 0x0435, 0x0442, + 0x002C, 0x0020, 0x043C, 0x0438, 0x0440, 0x0021}; + +cps_ref = cps; +cps_end = cps_ref + (sizeof(cps) / sizeof(lxb_codepoint_t)); + +/* Initialization */ +encoding = lxb_encoding_data(LXB_ENCODING_UTF_8); +``` + +Here, a buffer is prepared to hold the UTF-8 encoded bytes. The array `cps` contains the Unicode code points to be encoded. The code points are initialized and assigned pointers, `cps_ref` and `cps_end`, which reference the start and end of the code points array. + +The `lxb_encoding_data` function is then called with `LXB_ENCODING_UTF_8` to get the encoding data for UTF-8. + +### Encoder Initialization + +Next, we initialize the encoder: + +```c +status = lxb_encoding_encode_init(&encode, encoding, buffer, sizeof(buffer)); +if (status != LXB_STATUS_OK) { + FAILED("Failed to initialize encoder"); +} +``` + +The `lxb_encoding_encode_init` function initializes the encoder with the specified encoding (UTF-8) and buffer. It takes as parameters the encoder object, the encoding data, the buffer, and its size. If initialization fails, the `FAILED` macro will output an error message and exit the program. 
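The 1024-byte buffer is far larger than this input requires. A simple way to reason about sizing, shown as a sketch rather than code from the example: UTF-8 never needs more than 4 bytes per code point, so a conservative bound can be checked up front with the same `FAILED` helper.

```c
/* Worst case: 4 UTF-8 bytes per code point, plus one byte for a terminator. */
if ((sizeof(cps) / sizeof(lxb_codepoint_t)) * 4 + 1 > sizeof(buffer)) {
    FAILED("Output buffer is too small for the worst case");
}
```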
+ +### Encoding the Code Points + +With the encoder initialized, we proceed to encode the code points: + +```c +printf("Encode code points to UTF-8 byte string:\n"); + +status = encoding->encode(&encode, &cps_ref, cps_end); +if (status != LXB_STATUS_OK) { + /* In this example, this cannot happen. */ +} +``` + +Here, the `encoding->encode` function is invoked to encode the Unicode code points into the buffer as a UTF-8 string. It updates `cps_ref` to point to the next code point after the last encoded one upon completion. If encoding fails (though in this simple example it is not expected to), an error handling mechanism would be necessary. + +### Finalizing and Outputting the Encoded String + +The following lines finalize the buffer and output the result: + +```c +/* Terminate string */ +buffer[ lxb_encoding_encode_buf_used(&encode) ] = 0x00; + +/* Print result */ +cps_ref = cps; + +for (; cps_ref < cps_end; cps_ref++) { + printf("0x%04X", *cps_ref); +} + +printf("\nResult: %s\n", (char *) buffer); +``` + +The string is terminated by setting the byte after the used buffer space to `0x00`. This ensures the buffer is null-terminated, making it a valid C string. + +The original code points are printed in a loop, followed by the UTF-8 encoded result, providing a clear comparison between input code points and the final output. + +## Notes + +- The example uses UTF-8 encoding, but the `lexbor` library supports various encodings. +- Error handling is minimal in this example. Production code should robustly handle potential encoding errors. +- This example highlights the flexibility and ease-of-use of the `lexbor` library for encoding purposes. + +## Summary + +This example demonstrates how to encode an array of Unicode code points into a UTF-8 byte string using the `lexbor` library. Key takeaways include initializing the encoding environment, handling the buffers correctly, and encoding the data. Understanding this process is crucial for developers looking to work with text encoding in their applications using `lexbor`. \ No newline at end of file diff --git a/source/examples/encoding/buffer/encode/encoder.md b/source/examples/encoding/buffer/encode/encoder.md new file mode 100644 index 0000000..b9153a2 --- /dev/null +++ b/source/examples/encoding/buffer/encode/encoder.md @@ -0,0 +1,124 @@ +# Encoding Data with Escaped Sequences + +In this example, found in the file [lexbor/encoding/buffer/encode/encoder.c](https://github.com/lexbor/lexbor/blob/master/examples/lexbor/encoding/buffer/encode/encoder.c), we delve into an implementation that reads input data, processes any escaped sequences, and encodes the data using the specified character encoding. The purpose of this code is to demonstrate how the `lexbor` library can be used to handle textual data with escaped sequences and convert it to various encodings. This write-up explains key parts of the program, focusing on the logic and usage of `lexbor` functions. + +## Key Code Sections + +### Command Line Arguments Handling + +The program starts with a basic check for command line arguments, where it expects exactly one argument specifying the desired encoding. + +```c +if (argc != 2) { + usage(); + exit(EXIT_SUCCESS); +} +``` + +This section ensures that the user provides an encoding name, and if not, it shows usage instructions and exits. + +### Fetching and Initializing Encoding + +The encoding is determined from the user-provided argument, and the encoder is initialized accordingly. 
+ +```c +encoding = lxb_encoding_data_by_pre_name((const lxb_char_t *) argv[1], strlen(argv[1])); +if (encoding == NULL) { + FAILED(true, "Failed to get encoding from name: %s\n", argv[1]); +} + +status = lxb_encoding_encode_init(&encode, encoding, outbuf, sizeof(outbuf)); +if (status != Lxb_STATUS_OK) { + FAILED(false, "Failed to initialize encoder"); +} +``` + +Here, `lxb_encoding_data_by_pre_name` retrieves the encoding data, and `lxb_encoding_encode_init` initializes the encoding context. + +### Setting Replacement Bytes for Encoder + +Depending on the encoding specified, replacement bytes are set. This is crucial for handling invalid or unencodable sequences. + +```c +if (encoding->encoding == Lxb_ENCODING_UTF_8) { + status = lxb_encoding_encode_replace_set(&encode, LXB_ENCODING_REPLACEMENT_BYTES, LXB_ENCODING_REPLACEMENT_SIZE); +} +else { + status = lxb_encoding_encode_replace_set(&encode, (lxb_char_t *) "?", 1); +} + +if (status != LXB_STATUS_OK) { + FAILED(false, "Failed to set replacement bytes for encoder"); +} +``` + +UTF-8 has specific replacement bytes, while other encodings use a generic question mark. + +### Processing Input Data + +The program reads data from standard input in chunks of 4096 bytes, processes each chunk, and converts it into code points. + +```c +read_size = fread(inbuf, 1, sizeof(inbuf), stdin); +if (read_size != sizeof(inbuf)) { + if (feof(stdin)) { + loop = false; + } + else { + FAILED(false, "Failed to read stdin"); + } +} + +data = (const lxb_char_t *) inbuf; +end = data + read_size; +cp_end = escaped_to_codepoint(data, end, cp, &state, &cp_rep, loop == false); +``` + +This part handles reading input, processes potential partial reads due to end-of-file, and calls the `escaped_to_codepoint` function to process the escaped sequences into code points. + +### Encoding and Output + +After converting to code points, the data is encoded and written to standard output. + +```c +do { + status = encoding->encode(&encode, &cp_ref, cp_end); + read_size = lxb_encoding_encode_buf_used(&encode); + + if (fwrite(outbuf, 1, read_size, stdout) != read_size) { + FAILED(false, "Failed to write data to stdout"); + } + + lxb_encoding_encode_buf_used_set(&encode, 0); +} +while (status == LXB_STATUS_SMALL_BUFFER); +``` + +This loop ensures that all data is properly encoded and outputted, even handling cases where the buffer might be too small on the first pass. + +### Finalizing Encoding + +At the end of processing, the encoder is finalized to flush any remaining data. + +```c +(void) lxb_encoding_encode_finish(&encode); + +read_size = lxb_encoding_encode_buf_used(&encode); +if (read_size != 0) { + if (fwrite(outbuf, 1, read_size, stdout) != read_size) { + FAILED(false, "Failed to write data to stdout"); + } +} +``` + +This ensures that any leftover data in the encoder’s internal buffer is written out. + +## Notes + +1. **Error Handling**: The macro `FAILED` helps in providing consistent error messages and exits on failure. +2. **Escaped Sequence Processing**: The function `escaped_to_codepoint` is crucial for converting escaped sequences like `\xNN` and `\uNNNN` into code points. +3. **Buffer Management**: Proper buffer management ensures that encoding processes can handle partial reads and writes effectively. + +## Summary + +This example demonstrates how to use the `lexbor` library to handle input data with escaped sequences, converting it to the specified encoding. 
It showcases the critical steps for initializing encoders, processing input data, handling partial reads, and finalizing output. Understanding this example is essential for those looking to leverage `lexbor` for complex text encoding tasks in their applications. \ No newline at end of file diff --git a/source/examples/encoding/buffer/encode/validate.md b/source/examples/encoding/buffer/encode/validate.md new file mode 100644 index 0000000..33bef66 --- /dev/null +++ b/source/examples/encoding/buffer/encode/validate.md @@ -0,0 +1,116 @@ +# Validating Encoded Strings with `lexbor` + +This article explains the functioning of a code example found in the file +[lexbor/encoding/buffer/encode/validate.c](https://github.com/lexbor/lexbor/blob/master/examples/lexbor/encoding/buffer/encode/validate.c). This code demonstrates how to use +the `lexbor` library to encode a series of Unicode code points into a UTF-8 byte +string, validating and handling invalid code points along the way. + +The example showcases how to properly initialize an encoder with +the `lexbor` library, encode a sequence of Unicode code points into a UTF-8 byte +string, and manage any invalid code points encountered in the process. The +example's intent is to demonstrate the practical use of the `lexbor` encoding +library for encoding and validating Unicode sequences. + +## Key Code Sections + +### Initialization and Buffer Preparation + +The first critical step is to initialize the encoder and prepare the buffer for +the encoded output. + +```c +lxb_encoding_encode_t encode; +const lxb_codepoint_t *cps_ref, *cps_end; +const lxb_encoding_data_t *encoding; + +/* Prepare buffer */ +lxb_char_t buffer[1024]; +``` + +Here, the `encode` structure is declared to hold the encoder state. An array of +Unicode code points (`cps`) is prepared, consisting of valid and one invalid +code point (`0x110000`). Buffer size of 1024 bytes is allocated to hold the +encoded string. + +### Setting Up the Encoder + +```c +encoding = lxb_encoding_data(LXB_ENCODING_UTF_8); + +status = lxb_encoding_encode_init(&encode, encoding, buffer, sizeof(buffer)); +if (status != LXB_STATUS_OK) { + FAILED("Failed to initialize encoder"); +} + +status = lxb_encoding_encode_replace_set(&encode, LXB_ENCODING_REPLACEMENT_BYTES, + LXB_ENCODING_REPLACEMENT_SIZE); +if (status != LXB_STATUS_OK) { + FAILED("Failed to set replacement bytes for encoder"); +} +``` + +The encoder is initialized with the `UTF-8` encoding and the provided buffer. +The function `lxb_encoding_encode_init` takes care of this initialization. +Additionally, `lxb_encoding_encode_replace_set` sets the replacement bytes to +handle invalid code points. This ensures that invalid entries are substituted +with a predefined replacement sequence. + +### Encoding the Unicode Code Points + +```c +cps_ref = cps; +cps_end = cps_ref + (sizeof(cps) / sizeof(lxb_codepoint_t)); + +printf("Encode code points to UTF-8 byte string:\n"); + +/* Encode */ +status = encoding->encode(&encode, &cps_ref, cps_end); +if (status != LXB_STATUS_OK) { + /* In this example, this cannot happen. */ +} +``` + +The encoder processes the Unicode code points with the `encode` function, +transforming them into a UTF-8 byte string. The output buffer will contain the +encoded byte string, and a bad code point is replaced by the replacement bytes +set previously. 
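For reference, the code-point array processed above follows the same pattern as the one in the single-call validation example later in these docs: the text «Привет, мир!» with one deliberately out-of-range value. A sketch of what it may look like (the exact literal lives in the source file):

```c
/* "Привет, мир!" with one invalid code point in the middle. */
lxb_codepoint_t cps[] = {0x041F, 0x0440, 0x0438, 0x0432, 0x0435, 0x0442,
                         0x002C,
                         0x110000, /* <-- bad code point */
                         0x0020, 0x043C, 0x0438, 0x0440, 0x0021};
```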
+ +### Finalizing the Encoding and Printing the Result + +```c +buffer[ lxb_encoding_encode_buf_used(&encode) ] = 0x00; + +/* Print result */ +cps_ref = cps; + +for (; cps_ref < cps_end; cps_ref++) { + printf("0x%04X", *cps_ref); +} + +printf("\nResult: %s\n", (char *) buffer); +``` + +The encoded string is null-terminated using the `lxb_encoding_encode_buf_used` +to get the actual length of the encoded content. The original code points and +the resulting encoded string are printed to the stdout, showcasing how the +encoder dealt with the input, including the invalid code point. + +## Notes + +- **Error Handling**: Proper error handling is demonstrated with the `FAILED` + macro, ensuring that the program exits if initialization or replacement + byte setup fails. +- **Invalid Code Points**: The example shows how to handle invalid Unicode code + points gracefully by setting replacement bytes. +- **Initialization and Finalization**: Correct encoder initialization, buffer + setup, and string termination are important for ensuring the accuracy and + safety of the encoding process. + +## Summary + +This example demonstrates fundamental techniques in using the `lexbor` encoding +library for converting Unicode code points to a UTF-8 byte string. It emphasizes +error handling, the importance of setting replacement bytes for invalid code +points, and proper buffer management. Understanding these concepts is crucial +for developers working with Unicode text processing and encoding using the +`lexbor` library. \ No newline at end of file diff --git a/source/examples/encoding/buffer/from_to.md b/source/examples/encoding/buffer/from_to.md new file mode 100644 index 0000000..f2ea91f --- /dev/null +++ b/source/examples/encoding/buffer/from_to.md @@ -0,0 +1,172 @@ +# Character Encoding Conversion + +This document explains the [lexbor/encoding/buffer/from_to.c](https://github.com/lexbor/lexbor/blob/master/examples/lexbor/encoding/buffer/from_to.c) file in the `lexbor` library, which demonstrates how to read input data, decode it using one encoding, and then encode it with another encoding. This example highlights core functionalities of lexbor's encoding module. + +## Key Code Sections + +### Encoding Data Initialization + +The program starts by verifying the command-line arguments and retrieving the corresponding encoding data for the given `from` and `to` encodings. The `lxb_encoding_data_by_pre_name` function retrieves the encoding data by its name. + +```c +if (argc != 3) { + usage(); + exit(EXIT_SUCCESS); +} + +/* Get encoding data for 'from' */ +from = lxb_encoding_data_by_pre_name((const lxb_char_t *) argv[1], + strlen(argv[1])); +if (from == NULL) { + FAILED(true, "Failed to get encoding from name: %s\n", argv[1]); +} + +/* Get encoding data for 'to' */ +to = lxb_encoding_data_by_pre_name((const lxb_char_t *) argv[2], + strlen(argv[2])); +if (to == NULL) { + FAILED(true, "Failed to get encoding from name: %s\n", argv[2]); +} +``` + +The `from` and `to` variables store the encoding data retrieved based on the user's input. If the encoding names provided are invalid, the program exits with an error message. + +### Decoder and Encoder Initialization + +Next, the code initializes the decode and encode contexts using the retrieved encoding data. 
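The initialization and conversion code below relies on several fixed-size buffers and context structures. The sizes shown here are illustrative; the shipped example may use different ones. A sketch of the declarations being assumed makes the loop easier to follow:

```c
char inbuf[4096];            /* raw bytes read from stdin             */
lxb_codepoint_t cp[4096];    /* intermediate Unicode code points      */
lxb_char_t outbuf[4096];     /* bytes re-encoded to the 'to' charset  */

lxb_encoding_decode_t decode;
lxb_encoding_encode_t encode;
const lxb_encoding_data_t *from, *to;

lxb_status_t status, decode_status, encode_status;
```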
+ +```c +/* Initialization decode */ +status = lxb_encoding_decode_init(&decode, from, cp, + sizeof(cp) / sizeof(lxb_codepoint_t)); +if (status != LXB_STATUS_OK) { + FAILED(false, "Failed to initialization decoder"); +} + +status = lxb_encoding_decode_replace_set(&decode, + LXB_ENCODING_REPLACEMENT_BUFFER, LXB_ENCODING_REPLACEMENT_BUFFER_LEN); +if (status != LXB_STATUS_OK) { + FAILED(false, "Failed to set replacement code point for decoder"); +} + +/* Initialization encode */ +status = lxb_encoding_encode_init(&encode, to, outbuf, sizeof(outbuf)); +if (status != LXB_STATUS_OK) { + FAILED(false, "Failed to initialization encoder"); +} + +if (to->encoding == LXB_ENCODING_UTF_8) { + status = lxb_encoding_encode_replace_set(&encode, + LXB_ENCODING_REPLACEMENT_BYTES, LXB_ENCODING_REPLACEMENT_SIZE); +} +else { + status = lxb_encoding_encode_replace_set(&encode, (lxb_char_t *) "?", 1); +} + +if (status != LXB_STATUS_OK) { + FAILED(false, "Failed to set replacement bytes for encoder"); +} +``` + +The `lxb_encoding_decode_init` and `lxb_encoding_encode_init` functions initialize the decoder and encoder contexts, respectively. The replacements are set to handle invalid sequences during decoding and encoding. + +### Data Decoding and Encoding Loop + +The core of the program reads data from standard input, decodes it, and then encodes the resulting code points using the specified encoding. + +```c +do { + /* Read standard input */ + size = fread(inbuf, 1, sizeof(inbuf), stdin); + if (size != sizeof(inbuf)) { + if (feof(stdin)) { + loop = false; + } + else { + FAILED(false, "Failed to read stdin"); + } + } + + /* Decode incoming data */ + data = (const lxb_char_t *) inbuf; + end = data + size; + + do { + /* Decode */ + decode_status = from->decode(&decode, &data, end); + + cp_ref = cp; + cp_end = cp + lxb_encoding_decode_buf_used(&decode); + + do { + encode_status = to->encode(&encode, &cp_ref, cp_end); + if (encode_status == LXB_STATUS_ERROR) { + cp_ref++; + encode_status = LXB_STATUS_SMALL_BUFFER; + } + + size = lxb_encoding_encode_buf_used(&encode); + + /* The printf function cannot print \x00, it can be in UTF-16 */ + if (fwrite(outbuf, 1, size, stdout) != size) { + FAILED(false, "Failed to write data to stdout"); + } + + lxb_encoding_encode_buf_used_set(&encode, 0); + } + while (encode_status == LXB_STATUS_SMALL_BUFFER); + + lxb_encoding_decode_buf_used_set(&decode, 0); + } + while (decode_status == LXB_STATUS_SMALL_BUFFER); +} +while (loop); +``` + +This segment reads the input in chunks, decodes each chunk, and encodes the result. The loop handles the possibility that the buffers might be too small to hold the decoded or encoded data entirely at once. + +### Finalization of Decoding and Encoding + +After the input is fully processed, the program finalizes the decoding and encoding operations to ensure all data is correctly handled. 
+ +```c +/* End of file */ +/* In this moment encoder and decoder out buffer is empty */ + +/* First: finish decoding */ +(void) lxb_encoding_decode_finish(&decode); + +if (lxb_encoding_decode_buf_used(&decode)) { + cp_ref = cp; + cp_end = cp + lxb_encoding_decode_buf_used(&decode); + + (void) to->encode(&encode, &cp_ref, cp_end); + size = lxb_encoding_encode_buf_used(&encode); + + if (fwrite(outbuf, 1, size, stdout) != size) { + FAILED(false, "Failed to write data to stdout"); + } +} + +/* Second: finish encoding */ +(void) lxb_encoding_encode_finish(&encode); +size = lxb_encoding_encode_buf_used(&encode); + +if (size != 0) { + if (fwrite(outbuf, 1, size, stdout) != size) { + FAILED(false, "Failed to write data to stdout"); + } +} +``` + +The `lxb_encoding_decode_finish` and `lxb_encoding_encode_finish` functions ensure that any remaining data in the buffers is processed and outputted. + +## Notes + +- It is crucial to handle buffer sizes and potential overflows carefully to avoid data loss. +- Setting replacement characters or byte sequences helps manage invalid encoding sequences gracefully. +- Properly finalizing decoding and encoding processes ensures that all input data is correctly processed. + +## Summary + +This example illustrates how to use the `lexbor` library to convert data between different character encodings. It handles reading from standard input, decoding using one encoding, and then encoding to another, while managing buffer sizes and invalid sequences. Understanding this code helps users leverage lexbor's powerful encoding functionalities in their own applications. \ No newline at end of file diff --git a/source/examples/encoding/data_by_name.md b/source/examples/encoding/data_by_name.md new file mode 100644 index 0000000..84b786a --- /dev/null +++ b/source/examples/encoding/data_by_name.md @@ -0,0 +1,84 @@ +# Retrieve Encoding Data by Name + +This example demonstrates how to retrieve encoding data by name using the `lexbor` +library, as shown in the file [lexbor/encoding/data_by_name.c](https://github.com/lexbor/lexbor/blob/master/examples/lexbor/encoding/data_by_name.c). This code +illustrates the utilization of `lexbor` functions and data types to find specific +character encoding details based on a given encoding name. + +The purpose of the example is to demonstrate +how to use the `lexbor` library to query character encoding information by +providing an encoding name. This example is helpful for understanding how to +interact with the encoding module of `lexbor`, which is crucial for various +tasks such as text processing, web scraping, or any application requiring +character set conversions. + +## Key Code Sections + +### Finding Encoding Data by Name + +The main functionality of this example is encapsulated in the following lines: + +```c +enc_data = lxb_encoding_data_by_name((lxb_char_t *) "uTf-8", 5); +if (enc_data == NULL) { + return EXIT_FAILURE; +} +``` + +Here, `lxb_encoding_data_by_name` is called with the encoding name "uTf-8" +and its length (5). This function is designed to return a pointer to +`lxb_encoding_data_t` which contains information about the encoding. + +- **Function Call**: `lxb_encoding_data_by_name` converts the provided name + to a canonical form and searches for its associated encoding data. +- **Parameters**: + - `(lxb_char_t *) "uTf-8"`: The encoding name, cast to `lxb_char_t *`. + - `5`: The length of the encoding name. +- **Return Value**: The function returns a pointer to `lxb_encoding_data_t` + if the encoding is found. 
If not, `NULL` is returned. + +### Error Handling + +After the encoding data is retrieved, the code checks if the returned pointer +is `NULL`: + +```c +if (enc_data == NULL) { + return EXIT_FAILURE; +} +``` + +This ensures that the program handles the case where the encoding is not found +appropriately by exiting with `EXIT_FAILURE`. + +### Printing Encoding Name + +If the encoding is found, the name of the encoding is printed out: + +```c +printf("%s\n", enc_data->name); +``` + +`enc_data->name` holds the canonical encoding name. This line demonstrates +how to access and use the information within the `lxb_encoding_data_t` structure. + +## Notes + +- **Case Insensitivity**: The function `lxb_encoding_data_by_name` is + case-insensitive, as evidenced by the mixed-case input "uTf-8". +- **Canonical Form**: The returned encoding name is ensured to be in + a standard canonical form. +- **Static Data**: The encoding names and their data are typically + static and predefined within `lexbor`. + +## Summary + +This example highlights how to use the `lexbor` library to look up encoding +data by name. By invoking `lxb_encoding_data_by_name`, users can retrieve +information about specific encodings efficiently. Understanding this process +is vital for applications that handle diverse text encodings, ensuring proper +text interpretation and conversion. + +For `lexbor` users, this example provides a clear and practical method to +interact with the library’s encoding functionalities, facilitating smooth +integration into larger projects requiring robust encoding support. \ No newline at end of file diff --git a/source/examples/encoding/index.md b/source/examples/encoding/index.md new file mode 100644 index 0000000..280f861 --- /dev/null +++ b/source/examples/encoding/index.md @@ -0,0 +1,17 @@ +# Encoding Examples + +These articles delve into some of the examples in more detail to help you use +`lexbor` more effectively. + +```{toctree} +:maxdepth: 1 +:glob: + +* +buffer/* +buffer/decode/* +buffer/encode/* +single/* +single/decode/* +single/encode/* +``` diff --git a/source/examples/encoding/single/decode/decode.md b/source/examples/encoding/single/decode/decode.md new file mode 100644 index 0000000..86895c5 --- /dev/null +++ b/source/examples/encoding/single/decode/decode.md @@ -0,0 +1,86 @@ +# Decoding UTF-8 to Code Points + +The example provided in [lexbor/encoding/single/decode/decode.c](https://github.com/lexbor/lexbor/blob/master/examples/lexbor/encoding/single/decode/decode.c) demonstrates +how to use the `lexbor` library to decode a UTF-8 string into its respective Unicode +code points. This process involves initializing a decoder, processing each character +in the string, and handling the decoding results. + +## Key Code Sections + +### Buffer Preparation + +The example starts by defining the input string in UTF-8 and preparing pointers to +iterate through this string: + +```c +const lxb_char_t *data = (const lxb_char_t *) "Привет, мир!"; +const lxb_char_t *end = data + strlen((char *) data); +``` + +Here, `data` points to the start of the UTF-8 encoded string, and `end` points to +the address just after the last character of the string. This setup is essential +for the following decoding process. 
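Note that `end - data` is a byte count, not a character count: the nine Cyrillic letters each take two bytes in UTF-8, so the twelve code points of «Привет, мир!» occupy 21 bytes. A quick sketch to confirm this (not part of the original example):

```c
/* 9 Cyrillic letters x 2 bytes + ',' + ' ' + '!' = 21 bytes, 12 code points. */
printf("byte length: %zu\n", (size_t) (end - data));
```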
+ +### Initializing the Decoder + +Next, the example code initializes the decoder for UTF-8: + +```c +const lxb_encoding_data_t *encoding; +encoding = lxb_encoding_data(LXB_ENCODING_UTF_8); + +lxb_status_t status = lxb_encoding_decode_init_single(&decode, encoding); +if (status != LXB_STATUS_OK) { + FAILED("Failed to init decoder"); +} +``` + +Here, `lxb_encoding_data` retrieves the data structure for the specified encoding. +Then, `lxb_encoding_decode_init_single` initializes the decoding process using +this encoding. The function checks for successful initialization and exits if it +fails. + +### Decoding the String + +The core decoding loop processes each character in the input string: + +```c +while (data < end) { + pos = data; + + cp = encoding->decode_single(&decode, &data, end); + if (cp > LXB_ENCODING_DECODE_MAX_CODEPOINT) { + continue; + } + + printf("%.*s: 0x%04X\n", (int) (data - pos), pos, cp); +} +``` + +In each iteration of the loop: +- `pos` captures the current pointer position in the string. +- `decode_single` processes the next character, updating `data` to point to the + next position. +- If `cp` (code point) is valid, it prints the UTF-8 character and its + corresponding code point. + +The loop continues until `data` reaches the `end` of the string, effectively +decoding and printing every character. + +## Notes + +- The example is hardcoded to decode a specific UTF-8 string (`"Привет, мир!"`). +- The `decode_single` function is used for simplicity, suitable for decoding one + character at a time. +- Error handling is minimal, assuming that code points will always be valid for + the given string. + +## Summary + +This example from [lexbor/encoding/single/decode/decode.c](https://github.com/lexbor/lexbor/blob/master/examples/lexbor/encoding/single/decode/decode.c) demonstrates the basic +process of decoding a UTF-8 encoded string into Unicode code points using the lexbor +library. It initializes the decoder for UTF-8, iterates through the string, and +prints each character with its corresponding Unicode code point. This showcases +the practicality and ease of using the `lexbor` library for encoding-related tasks, +highlighting essential steps like buffer preparation, decoder initialization, +and the decoding process itself. \ No newline at end of file diff --git a/source/examples/encoding/single/decode/decoder.md b/source/examples/encoding/single/decode/decoder.md new file mode 100644 index 0000000..963f06b --- /dev/null +++ b/source/examples/encoding/single/decode/decoder.md @@ -0,0 +1,109 @@ +# Encoding Text Data with `lexbor` + +The example source file [lexbor/encoding/single/decode/decoder.c](https://github.com/lexbor/lexbor/blob/master/examples/lexbor/encoding/single/decode/decoder.c) provides an in-depth look at using the `lexbor` library to decode text data from various character encodings. The primary intent of this example is to demonstrate how to initialize a decoding context, read data from standard input, and correctly handle the decoding process using the `lexbor` library. This example targets developers aiming to understand the library's capabilities for text decoding and error handling. + +## Key Code Sections + +### Command-Line Argument Parsing and Usage + +The program begins by checking if the correct number of command-line arguments is provided. If not, it displays the usage information and exits. 
+ +```c +if (argc != 2) { + usage(); + exit(EXIT_SUCCESS); +} +``` + +The `usage` function prints the expected usage of the program, including the list of supported encodings. This helps users understand how to properly invoke the decoder and which encodings are available. + +### Encoding Initialization + +The encoding provided by the user as a command-line argument is determined using the `lxb_encoding_data_by_pre_name` function. + +```c +encoding = lxb_encoding_data_by_pre_name((const lxb_char_t *) argv[1], strlen(argv[1])); +if (encoding == NULL) { + FAILED(true, "Failed to get encoding from name: %s\n\n", argv[1]); +} + +status = lxb_encoding_decode_init_single(&decode, encoding); +if (status != LXB_STATUS_OK) { + FAILED(false, "Failed to init decoder"); +} +``` + +Here, the program retrieves the encoding data associated with the user-provided name. If the encoding is invalid, the program exits with an error message. Once the encoding data is obtained, it initializes the decoder object with `lxb_encoding_decode_init_single`. Proper initialization is crucial for subsequently processing the incoming data. + +### Reading from Standard Input + +The main decoding loop reads data from standard input in blocks and decodes them using the initialized decoder. + +```c +do { + read_size = fread(inbuf, 1, sizeof(inbuf), stdin); + if (read_size != sizeof(inbuf)) { + if (feof(stdin)) { + loop = false; + } + else { + FAILED(false, "Failed to read stdin"); + } + } + + data = (const lxb_char_t *) inbuf; + end = data + read_size; + + // Decoding happens here +} while (loop); +``` + +The input data is read in chunks and processed in a loop. The `fread()` function reads up to `sizeof(inbuf)` bytes from the standard input. If the read size is different (and the end of the file is not reached), it indicates an error. + +### Decoding Loop + +Inside the decoding loop, the program calls the decoder's `decode_single` method to decode individual characters. + +```c +while (data < end) { + cp = encoding->decode_single(&decode, &data, end); + if (cp > LXB_ENCODING_DECODE_MAX_CODEPOINT) { + if (cp == LXB_ENCODING_DECODE_CONTINUE) { + break; + } + printf("\\u%04X", LXB_ENCODING_REPLACEMENT_CODEPOINT); + continue; + } + + if (cp >= 0x00A0) { + printf("\\u%04X", cp); + } + else { + printf("\\x%02X", cp); + } +} +``` + +Here, `decode_single` decodes characters from the input buffer and manages input pointer `data`. Special handling is implemented for cases when the code point indicates a continuation (`LXB_ENCODING_DECODE_CONTINUE`) or an invalid character. Valid Unicode characters are printed in `\u` format, while ASCII characters are printed in `\x` format. + +### Handling Remaining Unfinished Decodings + +After the loop, if there's an indication that decoding was incomplete (i.e., if `cp` equals `LXB_ENCODING_DECODE_CONTINUE`), the program outputs a Unicode replacement character. + +```c +if (cp == LXB_ENCODING_DECODE_CONTINUE) { + printf("\\u%04X", LXB_ENCODING_REPLACEMENT_CODEPOINT); +} +``` + +This ensures that any unfinished multi-byte sequences are handled gracefully. + +## Notes + +1. **Error Handling**: The macro `FAILED` is used extensively to simplify error messages and includes conditional usage guidance. +2. **Buffer Management**: The program efficiently manages input data using a fixed-size buffer, ensuring that large input streams are handled correctly. +3. 
**Decoding Logic**: The implementation highlights robust decoding logic that appropriately handles different character encoding cases, including Unicode and ASCII conversions. + +## Summary + +This decoding example from the `lexbor` library demonstrates essential techniques for initializing encoding contexts, reading and decoding text data, and handling various edge cases. Being equipped with such knowledge allows developers to leverage `lexbor` for efficient and accurate character encoding transformation tasks across different applications. \ No newline at end of file diff --git a/source/examples/encoding/single/decode/validate.md b/source/examples/encoding/single/decode/validate.md new file mode 100644 index 0000000..6559295 --- /dev/null +++ b/source/examples/encoding/single/decode/validate.md @@ -0,0 +1,102 @@ +# UTF-8 String Decoding and Validation + +This article explains a demonstrative code file [lexbor/encoding/single/decode/validate.c](https://github.com/lexbor/lexbor/blob/master/examples/lexbor/encoding/single/decode/validate.c) that +decodes and validates a UTF-8 encoded string to code points using the `lexbor` library. The example +focuses on initializing the decoder, processing each byte sequence in the input string to validate +and decode it, and handling invalid byte sequences. + +## Key Code Sections + +### Initialization and Setup + +The main function initializes the necessary variables and prepares the input buffer. This part +includes selecting the UTF-8 encoding type and initializing the decoder struct: + +```c +int +main(int argc, const char *argv[]) +{ + lxb_status_t status; + lxb_codepoint_t cp; + lxb_encoding_decode_t decode; + const lxb_encoding_data_t *encoding; + const lxb_char_t *pos; + + /* Prepare buffer */ + const lxb_char_t *data = (const lxb_char_t *) "Привет,\x80 мир!"; + const lxb_char_t *end = data + strlen((char *) data); + + encoding = lxb_encoding_data(LXB_ENCODING_UTF_8); + + status = lxb_encoding_decode_init_single(&decode, encoding); + if (status != LXB_STATUS_OK) { + FAILED("Failed to init decoder"); + } +``` + +Key points: +- `lxb_encoding_data(LXB_ENCODING_UTF_8)` retrieves data for the UTF-8 encoding. +- `lxb_encoding_decode_init_single(&decode, encoding)` initializes the decoder structure for that encoding. + +### Decoding the Input String + +The core of the decoding process involves a loop to read each byte sequence of the input string and +convert it to Unicode code points: + +```c + printf("Decode and validate UTF-8 string \"%s\" to code points:\n", (char *) data); + + while (data < end) { + pos = data; + + cp = encoding->decode_single(&decode, &data, end); + if (cp > LXB_ENCODING_DECODE_MAX_CODEPOINT) { + printf("Bad byte sequences: 0x%04X; Replaced to: 0x%04X ('%s')\n", + *pos, LXB_ENCODING_REPLACEMENT_CODEPOINT, + LXB_ENCODING_REPLACEMENT_BYTES); + + continue; + } + + printf("%.*s: 0x%04X\n", (int) (data - pos), pos, cp); + } +``` + +Key points: +- The loop runs while `data` is less than `end` to process each byte sequence. +- `encoding->decode_single(&decode, &data, end)` performs the core decoding of the current byte sequence. +- If the decoded code point `cp` exceeds `LXB_ENCODING_DECODE_MAX_CODEPOINT`, it handles this invalid + byte sequence by replacing it with a predefined replacement code point and bytes. 
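A small variation on the same loop, shown here as a sketch rather than code from the shipped file, reports where in the input each invalid sequence starts:

```c
/* Sketch: report the byte offset of every invalid byte sequence. */
const lxb_char_t *begin = data;

while (data < end) {
    pos = data;

    cp = encoding->decode_single(&decode, &data, end);
    if (cp > LXB_ENCODING_DECODE_MAX_CODEPOINT) {
        printf("Invalid sequence at byte offset %zu\n", (size_t) (pos - begin));
    }
}
```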
+ +### Handling Invalid Byte Sequences + +When encountering invalid byte sequences, the code prints out an error message and continues: + +```c + if (cp > LXB_ENCODING_DECODE_MAX_CODEPOINT) { + printf("Bad byte sequences: 0x%04X; Replaced to: 0x%04X ('%s')\n", + *pos, LXB_ENCODING_REPLACEMENT_CODEPOINT, + LXB_ENCODING_REPLACEMENT_BYTES); + + continue; + } +``` + +Key points: +- The check `cp > LXB_ENCODING_DECODE_MAX_CODEPOINT` determines if the decoded value is valid. +- Invalid input sequences are substituted with `LXB_ENCODING_REPLACEMENT_CODEPOINT`, and an error message + is printed using the original byte. + +## Notes + +- The `lexbor` library's decoding functions must be initialized with the specific encoding data. +- Each byte sequence in the input string is validated and can be replaced if found invalid. +- The code uses a custom macro `FAILED` to handle initialization errors and terminate execution. + +## Summary + +This example demonstrates how to decode and validate a UTF-8 encoded string using the `lexbor` +library. By initializing the decoder with UTF-8 encoding, processing each byte sequence, and +handling invalid sequences, it showcases essential functionality for anyone working with text +processing and encoding validation using `lexbor`. This provides a practical foundation for +handling encoded text robustness in applications. \ No newline at end of file diff --git a/source/examples/encoding/single/encode/encode.md b/source/examples/encoding/single/encode/encode.md new file mode 100644 index 0000000..2c37108 --- /dev/null +++ b/source/examples/encoding/single/encode/encode.md @@ -0,0 +1,69 @@ +# UTF-8 Encoding Example + +The source file under discussion is [lexbor/encoding/single/encode/encode.c](https://github.com/lexbor/lexbor/blob/master/examples/lexbor/encoding/single/encode/encode.c). This example demonstrates how to encode a sequence of Unicode code points into a UTF-8 byte string using the `lexbor` library. The example covers the initialization of the encoding process, the encoding of individual Unicode code points, and the final assembly of the encoded string. + +## Key Code Sections + +### Initialization of Buffer and Encoding Setup + +First, the code initializes the buffer and sets up the encoding structure. + +```c +lxb_char_t buffer[1024]; +lxb_char_t *data = buffer; +const lxb_char_t *end = data + sizeof(buffer); + +// Unicode code points for encoding +lxb_codepoint_t cps[] = {0x041F, 0x0440, 0x0438, 0x0432, 0x0435, 0x0442, 0x002C, + 0x0020, 0x043C, 0x0438, 0x0440, 0x0021, 0}; + +encoding = lxb_encoding_data(LXB_ENCODING_UTF_8); + +status = lxb_encoding_encode_init_single(&encode, encoding); +if (status != LXB_STATUS_OK) { + FAILED("Failed to init encoder"); +} +``` +The buffer array serves as a container for the resulting UTF-8 byte string. The Unicode code points for "Привет, мир!" are specified in the `cps` array. The `lxb_encoding_data` function retrieves the encoding data for UTF-8, and `lxb_encoding_encode_init_single` initializes the `encode` structure for single character encoding. + +### Encoding Loop + +The next portion of the code encodes each Unicode code point and prints the results. + +```c +printf("Encode code points to UTF-8 byte string:\n"); + +for (size_t i = 0; cps[i] != 0; i++) { + pos = data; + + len = encoding->encode_single(&encode, &data, end, cps[i]); + if (len < LXB_ENCODING_ENCODE_OK) { + continue; + } + + printf("0x%04X: %.*s\n", cps[i], len, pos); +} +``` +Within the loop, `pos` stores the current position of `data`. 
The `encode_single` method encodes each code point into the buffer. `len` will be the number of bytes written, and the encoded representation of each code point is printed in hexadecimal. + +### Finalizing the Encoded String + +Finally, the code terminates the string and prints the result. + +```c +*data = 0x00; + +printf("\nResult: %s\n", (char *) buffer); +``` +Adding a null terminator `0x00` to the buffer ensures it is a well-formed C string. The full UTF-8 encoded result is then printed. + +## Notes + +1. **Buffer Initialization**: The buffer's size ensures that it can contain the encoded string, preventing overflow. +2. **Encoder Initialization**: The `lxb_encoding_encode_init_single` function is essential for setting up the encoding process. +3. **Error Handling**: The code handles potential encoding errors, although they are not expected in this specific example. +4. **String Termination**: Proper string termination is necessary for safe string operations in C. + +## Summary + +This example showcases how to encode Unicode code points into a UTF-8 byte string using the `lexbor` library. It highlights buffer management, the encoding process, and error handling. This is a useful reference for developers needing to perform character encoding tasks with lexbor, demonstrating critical library functions and proper C programming practices. \ No newline at end of file diff --git a/source/examples/encoding/single/encode/encoder.md b/source/examples/encoding/single/encode/encoder.md new file mode 100644 index 0000000..2bf28ec --- /dev/null +++ b/source/examples/encoding/single/encode/encoder.md @@ -0,0 +1,117 @@ +# Encoding Input Strings to a Specified Encoding + +This example in [lexbor/encoding/single/encode/encoder.c](https://github.com/lexbor/lexbor/blob/master/examples/lexbor/encoding/single/encode/encoder.c) demonstrates how to use the `lexbor` library to encode input strings to a specified encoding. The source file `encoder.c` provides a comprehensive example of how to handle encoding using the `lexbor` encoding library. This involves initializing the encoder, reading from standard input, processing escaped code points, and outputting the result in the specified encoding. + +## Key Code Sections + +### 1. Getting the Encoding + +The first key step is to determine the encoding based on the command-line argument provided by the user. + +```c +if (argc != 2) { + usage(); + exit(EXIT_SUCCESS); +} + +encoding = lxb_encoding_data_by_pre_name((const lxb_char_t *) argv[1], + strlen(argv[1])); +if (encoding == NULL) { + FAILED(true, "Failed to get encoding from name: %s\n", argv[1]); +} +``` + +This section reads the encoding name from the command line and retrieves the corresponding encoding data using `lxb_encoding_data_by_pre_name()`. If the encoding is not found, it prints an error message and exits. + +### 2. Initializing the Encoder + +Once the encoding is determined, we initialize the single byte encoder. + +```c +status = lxb_encoding_encode_init_single(&encode, encoding); +if (status != LXB_STATUS_OK) { + FAILED(false, "Failed to init encoder"); +} +``` + +This initializes an encoder for the specified encoding using `lxb_encoding_encode_init_single()`. If initialization fails, the program exits with an error message. + +### 3. Processing Input Data + +The main loop reads from the standard input and processes each chunk of data. 
+ +```c +do { + read_size = fread(inbuf, 1, sizeof(inbuf), stdin); + if (read_size != sizeof(inbuf)) { + if (feof(stdin)) { + loop = false; + } + else { + FAILED(false, "Failed to read stdin"); + } + } + + data = (const lxb_char_t *) inbuf; + end = data + read_size; + + while (data < end) { + data = escaped_to_codepoint(data, end, &cp, &state); + if (state != 0) { + if (loop || state != 3) { + break; + } + + state = 0; + } + + out = outbuf; + len = encoding->encode_single(&encode, &out, out_end, cp); + if (len < LXB_ENCODING_ENCODE_OK) { + if (len == LXB_ENCODING_ENCODE_SMALL_BUFFER) { + FAILED(false, "Failed to convert code point to bytes"); + } + + if (encoding->encoding == LXB_ENCODING_UTF_8) { + printf("%s", LXB_ENCODING_REPLACEMENT_BYTES); + continue; + } + + printf("?"); + continue; + } + + if (fwrite(outbuf, 1, len, stdout) != len) { + FAILED(false, "Failed to write data to stdout"); + } + } +} +while (loop); +``` + +This loop reads input data, processes it to convert code points to the target encoding, and then writes the result to the standard output. `escaped_to_codepoint()` is used to handle escape sequences in the input. + +### 4. Handling Escape Sequences + +The function `escaped_to_codepoint()` processes escaped code points from the input data, converting them into `lxb_codepoint_t`. + +```c +static const lxb_char_t * +escaped_to_codepoint(const lxb_char_t *data, const lxb_char_t *end, + lxb_codepoint_t *cp, int8_t *state) +{ + ... +} +``` + +This function manages the state of escape processing, ensuring that sequences are correctly translated into code points. + +## Notes + +- The use of the `FAILED()` macro simplifies error handling by printing an error message and exiting if necessary. +- This example handles a variety of encodings and demonstrates the flexibility of the `lexbor` library in encoding text data. +- Careful state management throughout the processing ensures robustness, especially when handling partial or malformed escape sequences. + +## Summary + +This example emphasizes how to use the `lexbor` library to convert input strings into a specified encoding. It covers initialization, processing input in chunks, handling escape sequences, and ensuring the encoded output is correctly written. This illustration is vital for developers looking to integrate robust encoding capabilities in their applications using `lexbor`. \ No newline at end of file diff --git a/source/examples/encoding/single/encode/validate.md b/source/examples/encoding/single/encode/validate.md new file mode 100644 index 0000000..7e17d2a --- /dev/null +++ b/source/examples/encoding/single/encode/validate.md @@ -0,0 +1,112 @@ +# Encode and Validate Unicode Code Points + +This article explains the code from the `lexbor` library in the file [lexbor/encoding/single/encode/validate.c](https://github.com/lexbor/lexbor/blob/master/examples/lexbor/encoding/single/encode/validate.c). The example demonstrates how to encode a sequence of Unicode code points into a UTF-8 byte string and handle validation of those points, especially focusing on dealing with invalid Unicode code points. + +The example code shows how to use the `lexbor` library to encode an array of Unicode code points into UTF-8. It includes the crucial steps of initializing the encoder, iterating through the Unicode code points, encoding each point, handling errors, replacing invalid code points, and finally, outputting the encoded string. 
+ +## Key Code Sections + +### Buffer Preparation + +The code prepares the buffer that will hold the encoded UTF-8 byte string: + +```c +/* Prepare buffer */ +lxb_char_t buffer[1024]; +lxb_char_t *data = buffer; +const lxb_char_t *end = data + sizeof(buffer); +``` + +Here, `buffer` is a fixed-size array where the encoded UTF-8 data will be stored. `data` is a pointer that will be adjusted as data is written into the buffer, and `end` marks the endpoint of the buffer to prevent overflow. + +### Defining Unicode Code Points + +A set of Unicode code points, including an invalid one, is defined for encoding: + +```c +/* Unicode code points for encoding */ +lxb_codepoint_t cps[] = {0x041F, 0x0440, 0x0438, 0x0432, 0x0435, 0x0442, + 0x002C, + 0x110000, /* <-- bad code point */ + 0x0020, 0x043C, 0x0438, 0x0440, 0x0021, 0}; +``` + +This array includes a mix of valid Unicode code points and an intentionally invalid code point (`0x110000`). The `0` at the end signifies the end of the array. + +### Initialize Encoder + +An encoder for the UTF-8 encoding is initialized: + +```c +encoding = lxb_encoding_data(LXB_ENCODING_UTF_8); + +status = lxb_encoding_encode_init_single(&encode, encoding); +if (status != LXB_STATUS_OK) { + FAILED("Failed to init encoder"); +} +``` + +The `lxb_encoding_data` function fetches the encoding data structure for UTF-8, and `lxb_encoding_encode_init_single` initializes the single-byte encoder context. An error check ensures that the encoder was initialized successfully. + +### Encoding and Validation + +The code iterates over the Unicode code points array to validate and encode each point: + +```c +for (size_t i = 0; cps[i] != 0; i++) { + pos = data; + + len = encoding->encode_single(&encode, &data, end, cps[i]); + + if (len < LXB_ENCODING_ENCODE_OK) { + if (len == LXB_ENCODING_ENCODE_SMALL_BUFFER) { + break; + } + + printf("Bad code point: 0x%04X; Replaced to: %s (0x%04X)\n", + cps[i], LXB_ENCODING_REPLACEMENT_BYTES, + LXB_ENCODING_REPLACEMENT_CODEPOINT); + + memcpy(data, LXB_ENCODING_REPLACEMENT_BYTES, + LXB_ENCODING_REPLACEMENT_SIZE); + + data += LXB_ENCODING_REPLACEMENT_SIZE; + + continue; + } + + printf("0x%04X: %.*s\n", cps[i], len, pos); +} +``` + +For each code point: + +1. `pos` marks the initial position in the buffer. +2. `encoding->encode_single` attempts to encode the current code point. +3. If the return value `len` indicates an error: + - It checks if the buffer is too small (the code handles it theoretically, though it never occurs here due to enough buffer space). + - For invalid code points, it replaces them with a predefined replacement character (commonly `0xFFFD` in UTF-8). +4. If the encoding is successful, `len` specifies the number of bytes written. + +### Final Output + +The result is terminated with a null character and printed: + +```c +/* Terminate string */ +*data = 0x00; + +printf("\nResult: %s\n", (char *) buffer); +``` + +This step ensures the buffer is a valid C string and outputs the final encoded string. + +## Notes + +- The example uses `lexbor`'s encoding library for UTF-8 encoding. +- Error handling is implemented to manage invalid Unicode code points. +- The buffer is large enough to handle the encoded output, avoiding buffer overflow concerns in this context. + +## Summary + +This example demonstrates the usage of the `lexbor` library for encoding Unicode code points into UTF-8, handling errors gracefully, and replacing invalid code points. 
It highlights lexbor's flexibility and robustness in dealing with text encoding tasks, proving indispensable for applications needing precise control over encoding processes. By understanding this example, developers can leverage lexbor's capabilities for their encoding needs, ensuring correct handling and encoding of text data. \ No newline at end of file diff --git a/source/examples/encoding/single/from_to.md b/source/examples/encoding/single/from_to.md new file mode 100644 index 0000000..92e17a8 --- /dev/null +++ b/source/examples/encoding/single/from_to.md @@ -0,0 +1,152 @@ +# Text Conversion Through Custom Encodings + +This article will provide an in-depth explanation of the [lexbor/encoding/single/from_to.c](https://github.com/lexbor/lexbor/blob/master/examples/lexbor/encoding/single/from_to.c) +example file. The intent of this example is to demonstrate how the `lexbor` library can be +used to read input text in one character encoding, decode it, and then encode it to another +character encoding before writing it out. The article will break down the important sections +of the code, explain the functionality provided by the `lexbor` library, and present +key insights for potential users. + +The example code uses the `lexbor` library to +create a program that reads text input in one encoding, decodes it to a universal codepoint +representation, and re-encodes it to a different encoding before outputting it. This process +involves setting up encoding and decoding specifications, handling I/O efficiently, and +managing edge cases in encoding conversion. + +## Key Code Sections + +### Command-Line Argument Processing + +The program begins by checking if the correct number of command-line arguments are provided, +which represent the 'from' and 'to' encodings. + +```c +if (argc != 3) { + usage(); + exit(EXIT_SUCCESS); +} + +/* Get encoding data for 'from' */ +from = lxb_encoding_data_by_pre_name((const lxb_char_t *) argv[1], strlen(argv[1])); +if (from == NULL) { + FAILED(true, "Failed to get encoding from name: %s", argv[1]); +} + +/* Get encoding data for 'to' */ +to = lxb_encoding_data_by_pre_name((const lxb_char_t *) argv[2], strlen(argv[2])); +if (to == NULL) { + FAILED(true, "Failed to get encoding from name: %s", argv[2]); +} +``` + +Here, the `lxb_encoding_data_by_pre_name` function retrieves the encoding data based on the +provided name. If the encoding data cannot be found, the program exits with an error. + +#### Initializing Encoders and Decoders + +The code initializes the encoding and decoding structures provided by the `lexbor` library. + +```c +status = lxb_encoding_encode_init_single(&encode, to); +if (status != LXB_STATUS_OK) { + FAILED(true, "Failed to init encoder"); +} + +status = lxb_encoding_decode_init_single(&decode, from); +if (status != LXB_STATUS_OK) { + FAILED(true, "Failed to init decoder"); +} +``` + +The `lxb_encoding_encode_init_single` and `lxb_encoding_decode_init_single` functions prepare +the encoder and decoder for the specified encodings. Handling their status ensures proper +resource initialization before processing input. + +### Reading and Processing Input Data + +The core logic of reading input data, decoding it, transforming it to a codepoint and re-encoding +is encapsulated in a loop that handles data in chunks. 
+ +```c +do { + /* Read standard input */ + read_size = fread(inbuf, 1, sizeof(inbuf), stdin); + if (read_size != sizeof(inbuf)) { + if (feof(stdin)) { + loop = false; + } + else { + FAILED(false, "Failed to read stdin"); + } + } + + /* Decode incoming data */ + data = (const lxb_char_t *) inbuf; + end = data + read_size; + + while (data < end) { + /* Decode */ + cp = from->decode_single(&decode, &data, end); + if (cp > LXB_ENCODING_DECODE_MAX_CODEPOINT) { + if (cp == LXB_ENCODING_DECODE_CONTINUE && loop) { + break; + } + cp = LXB_ENCODING_REPLACEMENT_CODEPOINT; + } + + /* Encode */ + out = outbuf; + len = to->encode_single(&encode, &out, out_end, cp); + if (len < LXB_ENCODING_ENCODE_OK) { + printf("?"); + continue; + } + + if (fwrite(outbuf, 1, len, stdout) != len) { + FAILED(false, "Failed to write data to stdout"); + } + } +} +while (loop); +``` + +The input is read in chunks of 4096 bytes, decoded character by character to codepoints, and +then re-encoded using the target encoding. Any decoding errors result in a replacement codepoint +being used, while encoding errors default to printing a question mark (`?`). + +### Finalizing Encoding and Decoding + +Finally, the program ensures that any remaining buffer data is handled by finalizing the +decoding and encoding processes. + +```c +status = lxb_encoding_decode_finish_single(&decode); +if (status != LXB_STATUS_OK) { + printf("?"); +} + +out = outbuf; +len = lxb_encoding_encode_finish_single(&encode, &out, out_end); +if (len != 0) { + if (fwrite(outbuf, 1, len, stdout) != len) { + FAILED(false, "Failed to write data to stdout"); + } +} +``` + +These steps ensure that any buffered data is properly flushed out before program termination. + +## Notes + +1. The program supports a wide range of encodings, making it a versatile tool for encoding conversion. +2. Error handling and edge cases are managed to ensure the program does not crash on unexpected input. +3. The `lexbor` library provides comprehensive functions for encoding and decoding, making such + conversions straightforward. + +## Summary + +This example highlights how the `lexbor` library can be used to build a robust encoding conversion +tool. The key takeaways include understanding how to initialize encoding and decoding structures, +process input data efficiently, handle error cases gracefully, and ensure that conversions +are completed correctly before program exit. Such an understanding can facilitate building +more sophisticated text processing tools using the `lexbor` library. \ No newline at end of file diff --git a/source/examples/html/document_parse.md b/source/examples/html/document_parse.md new file mode 100644 index 0000000..a194058 --- /dev/null +++ b/source/examples/html/document_parse.md @@ -0,0 +1,68 @@ +# Parsing an HTML Document + +In this example, located in the [lexbor/html/document_parse.c](https://github.com/lexbor/lexbor/blob/master/examples/lexbor/html/document_parse.c) file, we see a typical usage scenario of the `lexbor` library for parsing an HTML document. This example demonstrates the creation of an HTML document object, basic parsing of HTML content, and serialization of the resulting DOM tree. + +The example provides a clear, concise illustration of how to initialize and use the `lexbor` library to parse an HTML document. The example highlights crucial library functions and demonstrates error handling during document creation and HTML parsing. We will analyze several important sections of the code to understand its workings. 
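
One thing to note before the walkthrough: the snippets below call a small `serialize` helper that is defined elsewhere in the example sources and is not shown in this article. A plausible minimal version, built on lexbor's pretty-printing serializer, could look like the sketch below; the serializer function and option names are assumptions about the lexbor API rather than code copied from the example:

```c
static lxb_status_t
serializer_callback(const lxb_char_t *data, size_t len, void *ctx)
{
    /* Write each serialized chunk straight to stdout. */
    printf("%.*s", (int) len, (const char *) data);
    return LXB_STATUS_OK;
}

static lxb_status_t
serialize(lxb_dom_node_t *node)
{
    /* Pretty-print the whole subtree rooted at node. */
    return lxb_html_serialize_pretty_tree_cb(node,
                                             LXB_HTML_SERIALIZE_OPT_UNDEF, 0,
                                             serializer_callback, NULL);
}
```
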
+ +## Key Code Sections + +### Creating an HTML Document + +First, the code initializes the `lexbor` HTML document object. This is important because the document object forms the anchor point for subsequent parsing and manipulation operations. + +```c +document = lxb_html_document_create(); +if (document == NULL) { + FAILED("Failed to create HTML Document"); +} +``` + +Here, the `lxb_html_document_create()` function is called to allocate and initialize a new HTML document object. If the allocation fails, the program prints an error message and terminates. + +### Parsing the HTML + +Next, the example proceeds to parse a static HTML string. + +```c +status = lxb_html_document_parse(document, html, html_len); +if (status != LXB_STATUS_OK) { + FAILED("Failed to parse HTML"); +} +``` + +The `lxb_html_document_parse()` function is used to parse the HTML content. The function takes the document object, a pointer to the HTML data, and the length of this data. If parsing fails (indicated by a status other than `LXB_STATUS_OK`), an error message is printed, and the program halts. + +### Outputting the Parsed Content + +To aid understanding, the code prints both the original HTML content and the resulting parsed DOM tree. + +```c +PRINT("HTML:"); +PRINT("%s", (const char *) html); +``` + +```c +PRINT("\nHTML Tree:"); +serialize(lxb_dom_interface_node(document)); +``` + +The `PRINT` macro is used to output the HTML content and the resulting DOM tree. The `serialize` function (not fully shown in the excerpt) is responsible for serializing the DOM tree to a human-readable format, providing insight into the structure of the parsed document. + +### Cleaning Up + +Finally, the example demonstrates proper resource management by destroying the created HTML document. + +```c +lxb_html_document_destroy(document); +``` + +This call to `lxb_html_document_destroy()` ensures that all resources allocated to the document object are released, preventing memory leaks. + +## Notes + +- **Error Handling**: The example employs a clear error handling strategy, checking the success of crucial library calls and halting execution when failures occur. +- **Serialization**: The use of a custom `serialize` function (assumed to be defined elsewhere in the code) helps visualize the resulting DOM tree, which is beneficial for both debugging and learning purposes. + +## Summary + +This example code from [lexbor/html/document_parse.c](https://github.com/lexbor/lexbor/blob/master/examples/lexbor/html/document_parse.c) serves as an excellent starting point for understanding basic document parsing using the `lexbor` library. It covers essential aspects such as initialization, parsing, and cleanup, while also demonstrating how to handle errors effectively. Typical `lexbor` users can draw valuable insights from this example to incorporate into their own projects, particularly concerning proper resource management and direct interaction with the HTML DOM. \ No newline at end of file diff --git a/source/examples/html/document_parse_chunk.md b/source/examples/html/document_parse_chunk.md new file mode 100644 index 0000000..8fe0594 --- /dev/null +++ b/source/examples/html/document_parse_chunk.md @@ -0,0 +1,100 @@ +# Parsing HTML in Chunks with lexbor + +This article provides a detailed examination of [lexbor/html/document_parse_chunk.c](https://github.com/lexbor/lexbor/blob/master/examples/lexbor/html/document_parse_chunk.c), a C code example demonstrating how to parse HTML content in chunks using the `lexbor` library. 
Parsing HTML in chunks can be particularly useful when dealing with streaming data, allowing for efficient and incremental data processing. + +## Key Code Sections + +### Initialization of the HTML Document + +The first critical section of this example is the initialization of the HTML document object: + +```c +document = lxb_html_document_create(); +if (document == NULL) { + FAILED("Failed to create HTML Document"); +} +``` + +Here, the `lxb_html_document_create()` function is called to create an HTML document. The function returns a pointer to the newly created `lxb_html_document_t` structure. If the creation fails, it returns `NULL`, prompting an error message. + +### Beginning the Chunk Parsing Process + +After the document is initialized, the parsing process begins with the following lines: + +```c +status = lxb_html_document_parse_chunk_begin(document); +if (status != LXB_STATUS_OK) { + FAILED("Failed to parse HTML"); +} +``` + +The function `lxb_html_document_parse_chunk_begin()` prepares the document object for incremental parsing. It initializes the necessary internal structures and state, ensuring that the document is ready to accept chunks of HTML data. Handling the `LXB_STATUS_OK` status ensures the operation is successful. + +### Feeding HTML Chunks to the Parser + +The code then iterates through an array of HTML chunks, feeding each one to the parser: + +```c +for (size_t i = 0; html[i][0] != '\0'; i++) { + PRINT("%s", (const char *) html[i]); + + status = lxb_html_document_parse_chunk(document, html[i], + strlen((const char *) html[i])); + if (status != LXB_STATUS_OK) { + FAILED("Failed to parse HTML chunk"); + } +} +``` + +In this loop, each element of the `html` array represents a chunk of the HTML document. The `lxb_html_document_parse_chunk()` function is called with three arguments: the document, the current chunk, and the chunk's length. This function parses each chunk and updates the document's state accordingly. The code also prints each chunk before parsing it, providing a trace of the incoming data. + +### Completing the Chunk Parsing Process + +Once all chunks are processed, the code completes the parsing process: + +```c +status = lxb_html_document_parse_chunk_end(document); +if (status != LXB_STATUS_OK) { + FAILED("Failed to parse HTML"); +} +``` + +The `lxb_html_document_parse_chunk_end()` function finalizes the incremental parsing process. It ensures that any remaining parsing tasks are completed and the document structure is properly built. + +### Serialization of the HTML Document Tree + +The next section serializes and prints the parsed HTML document tree: + +```c +PRINT("\nHTML Tree:"); +serialize(lxb_dom_interface_node(document)); +``` + +The `serialize()` function, though not defined in this snippet, presumably converts the internal document tree into a human-readable format and prints it. The `lxb_dom_interface_node()` function provides an interface to the document's root node, which `serialize()` then processes. + +### Destruction of the HTML Document + +Finally, the document object is destroyed to free allocated resources: + +```c +lxb_html_document_destroy(document); +``` + +This function ensures that all memory and resources associated with the document object are appropriately released, preventing memory leaks. + +## Notes + +- **Chunk Parsing**: This example shows a common approach for handling streaming data by breaking it into manageable chunks. 
+- **Error Handling**: The code checks the status after every parsing function call, ensuring robust error detection and messaging. +- **Resource Management**: Proper creation and destruction of objects ensure efficient use of memory resources. + +## Summary + +This example demonstrates the use of the `lexbor` library for parsing HTML +content incrementally. By initializing a document, processing it in chunks, +finalizing the parse, and printing the result, users can handle large or +streaming HTML data efficiently. This pattern is crucial for applications that +need to process data as it arrives, such as web crawlers or real-time data +analytics systems. + +Understanding this example provides a solid foundation for leveraging `lexbor` in complex, data-intensive applications. \ No newline at end of file diff --git a/source/examples/html/document_title.md b/source/examples/html/document_title.md new file mode 100644 index 0000000..f52ae73 --- /dev/null +++ b/source/examples/html/document_title.md @@ -0,0 +1,139 @@ +# Manipulating HTML Document Title + +This article provides an in-depth explanation of the example code in +[lexbor/html/document_title.c](https://github.com/lexbor/lexbor/blob/master/examples/lexbor/html/document_title.c), which demonstrates how to work with HTML +document titles using the `lexbor` library. The code illustrates initializing a +document, parsing an HTML string, extracting and modifying the title, and +printing the tree structure before and after the change. + +## Key Code Sections + +### Initializing the HTML Document + +The first critical step in the example is the creation of an HTML document +object. This object will represent the entire HTML structure that the lexbor +library manages. + +```c +document = lxb_html_document_create(); +if (document == NULL) { + FAILED("Failed to create HTML Document"); +} +``` + +Here, the function `lxb_html_document_create` is used to allocate and +initialize a new `lxb_html_document_t` structure. If the initialization fails, +the program will print an error message and terminate. + +### Parsing the HTML String + +Once the document is created, the example code parses a provided HTML string. + +```c +status = lxb_html_document_parse(document, html, html_len); +if (status != LXB_STATUS_OK) { + FAILED("Failed to parse HTML"); +} +``` + +The `lxb_html_document_parse` function takes the document object and the HTML +string along with its length to populate the document with the appropriate +nodes and structure. Proper error handling is shown to ensure that parsing +completes successfully. + +### Retrieving the Document Title + +The example demonstrates two methods for retrieving the document title: +formatted and raw. + +```c +title = lxb_html_document_title(document, &title_len); +if (title == NULL) { + PRINT("\nTitle is empty"); +} +else { + PRINT("\nTitle: %s", title); +} + +... + +title = lxb_html_document_title_raw(document, &title_len); +if (title == NULL) { + PRINT("Raw title is empty"); +} +else { + PRINT("Raw title: %s", title); +} +``` + +The `lxb_html_document_title` function retrieves the title after trimming +whitespace and normalizing spaces. Conversely, `lxb_html_document_title_raw` +returns the title exactly as it appears in the document, preserving all +original formatting and whitespace. + +### Modifying the Document Title + +Next, the example code changes the document title to a new value provided by +`new_title`. 
+ +```c +status = lxb_html_document_title_set(document, new_title, new_title_len); +if (status != LXB_STATUS_OK) { + FAILED("Failed to change HTML title"); +} +``` + +Here, the `lxb_html_document_title_set` function is called with the new title +and its length. This function updates the document's title element, and error +handling ensures the operation completes successfully. + +### Serializing and Printing the HTML Tree + +After modifying the title, the example prints the document's tree structure +before and after the title change. + +```c +PRINT("HTML Tree: "); +serialize(lxb_dom_interface_node(document)); + +... + +PRINT("\nHTML Tree after change title: "); +serialize(lxb_dom_interface_node(document)); +``` + +The `serialize` function is used to output the tree structure, showing all +nodes and their relationships. This helps visualize the changes made to the +document. + +### Cleaning Up + +Finally, the code cleans up by destroying the document object, freeing any +resources allocated during its creation and manipulation. + +```c +lxb_html_document_destroy(document); +``` + +This is crucial to prevent memory leaks and ensure proper program termination. + +## Notes + +- **Error Handling**: Robust error handling ensures that each operation + (creation, parsing, modification) completes successfully or produces useful + output if it fails. +- **Title Retrieval vs. Raw Title**: The distinction between normalizing + whitespaces in the title versus retrieving it as-is can be important for + different application needs. +- **Resource Management**: Proper allocation and deallocation of resources are + demonstrated to maintain program stability and efficiency. + +## Summary + +In this example, we've explored the use of the `lexbor` library to manipulate an +HTML document's title. The code demonstrates document creation, HTML parsing, +title extraction, title modification, and tree serialization. Key takeaways +include understanding lexbor's various functions for title handling and the +importance of resource management and error handling. This example is a helpful +reference for developers looking to programmatically control HTML content using +lexbor. \ No newline at end of file diff --git a/source/examples/html/element_attributes.md b/source/examples/html/element_attributes.md new file mode 100644 index 0000000..19e2e01 --- /dev/null +++ b/source/examples/html/element_attributes.md @@ -0,0 +1,144 @@ +# Handling Element Attributes with `lexbor` + +This article explores the [lexbor/html/element_attributes.c](https://github.com/lexbor/lexbor/blob/master/examples/lexbor/html/element_attributes.c) example, which demonstrates parsing an HTML document, manipulating DOM elements, and their attributes using the `lexbor` library. The example focuses on setting, getting, checking for existence, iterating over, changing, and finally removing attributes of a DOM element within a parsed HTML document. + +## Key Code Sections + +### Parsing the HTML Document + +```c +static const lxb_char_t html[] = "
"; +size_t html_len = sizeof(html) - 1; + +/* Parse */ +document = parse(html, html_len); +``` + +The HTML document defined as a static string is parsed using the `parse` function that constructs an `lxb_html_document_t` object. This is the initial step, setting up the environment for further DOM manipulations. + +### Creating and Using a Collection + +```c +/* Create Collection for elements */ +collection = lxb_dom_collection_make(&document->dom_document, 16); +if (collection == NULL) { + FAILED("Failed to create collection"); +} +``` + +A `lxb_dom_collection_t` is created to store elements found during searching. This is essential for working with multiple elements efficiently. The collection is initialized with a pre-defined capacity of 16 elements. + +### Finding and Accessing Elements + +```c +/* Get BODY element (root for search) */ +body = lxb_html_document_body_element(document); +element = lxb_dom_interface_element(body); + +/* Find DIV element */ +status = lxb_dom_elements_by_tag_name(element, collection, + (const lxb_char_t *) "div", 3); + +if (status != LXB_STATUS_OK || lxb_dom_collection_length(collection) == 0) { + FAILED("Failed to find DIV element"); +} +``` + +Here, the `body` element serves as the root for the search. The `lxb_dom_elements_by_tag_name` function searches for all `div` tags and stores them in the collection. Error checks ensure that the `div` elements are found successfully. + +### Setting and Appending Attributes + +```c +attr = lxb_dom_element_set_attribute(element, name, name_size, + (const lxb_char_t *) "oh God", 6); +if (attr == NULL) { + FAILED("Failed to create and append new attribute"); +} +``` + +A new attribute is appended to the `div` element using `lxb_dom_element_set_attribute`. The attribute name is "my-name" and its value is "oh God". The function creates the attribute if it doesn't already exist and appends it to the element. + +### Checking Attribute Existence + +```c +is_exist = lxb_dom_element_has_attribute(element, name, name_size); + +if (is_exist) { + PRINT("\nElement has attribute \"%s\": true", (const char *) name); +} +else { + PRINT("\nElement has attribute \"%s\": false", (const char *) name); +} +``` + +The `lxb_dom_element_has_attribute` checks whether the given attribute exists on the element. The result is printed accordingly. + +### Retrieving Attribute Value + +```c +value = lxb_dom_element_get_attribute(element, name, name_size, &value_len); +if (value == NULL) { + FAILED("Failed to get attribute value by qualified name"); +} + +PRINT("Get attribute value by qualified name \"%s\": %.*s", + (const char *) name, (int) value_len, value); +``` + +`lxb_dom_element_get_attribute` retrieves the value of the specified attribute. If the attribute is found, its value and length are returned and printed. This section shows how to access the values of element attributes. + +### Iterating Over Attributes + +```c +/* Iterator */ +PRINT("\nGet element attributes by iterator:"); +attr = lxb_dom_element_first_attribute(element); + +while (attr != NULL) { + tmp = lxb_dom_attr_qualified_name(attr, &tmp_len); + printf("Name: %s", tmp); + + tmp = lxb_dom_attr_value(attr, &tmp_len); + if (tmp != NULL) { + printf("; Value: %s\n", tmp); + } + else { + printf("\n"); + } + + attr = lxb_dom_element_next_attribute(attr); +} +``` + +Using an iterator, this section retrieves and prints all attributes of the element. `lxb_dom_element_first_attribute` gets the first attribute, and `lxb_dom_element_next_attribute` progresses through the list. 
+ +### Changing Attribute Value + +```c +attr = lxb_dom_element_attr_by_name(element, name, name_size); +status = lxb_dom_attr_set_value(attr, (const lxb_char_t *) "new value", 9); +if (status != LXB_STATUS_OK) { + FAILED("Failed to change attribute value"); +} +``` + +Changing an attribute's value involves first retrieving the attribute using `lxb_dom_element_attr_by_name` and then setting the value with `lxb_dom_attr_set_value`. Error checking ensures that the operation is successful. + +### Removing Attributes + +```c +/* Remove new attribute by name */ +lxb_dom_element_remove_attribute(element, name, name_size); +``` + +The final operation removes the specified attribute from the element using `lxb_dom_element_remove_attribute`. This demonstrates the library's capabilities for cleaning up or updating the DOM. + +## Notes + +- Proper error handling is crucial when manipulating the DOM to ensure robust and predictable behavior. +- Iterating over attributes can provide useful insights into the current state of an element's attributes, useful for debugging or further manipulation. +- Changing and removing attributes dynamically allows for flexible DOM updates. + +## Summary + +This example demonstrates how to create, manipulate, and manage element attributes using the `lexbor` library, covering parsing HTML, finding elements, setting, retrieving, iterating over, changing, and removing attributes. These operations form the basis for extensive DOM manipulations in web development and highlight the power and flexibility of `lexbor` for such tasks. Understanding these fundamentals is essential for effectively utilizing `lexbor` in complex web applications. \ No newline at end of file diff --git a/source/examples/html/element_create.md b/source/examples/html/element_create.md new file mode 100644 index 0000000..9f1716d --- /dev/null +++ b/source/examples/html/element_create.md @@ -0,0 +1,119 @@ +# HTML Element Creation and Traversal + +In this example, sourced from the [lexbor/html/element_create.c](https://github.com/lexbor/lexbor/blob/master/examples/lexbor/html/element_create.c) file, we will +delve into creating and manipulating HTML elements using the `lexbor` library. +This article provides a deep dive into the code, explaining how to dynamically +create every standardized HTML element, insert them into the document tree and +serialize the current structure. This example is pivotal for those seeking to +comprehend the intricacies of DOM manipulation with lexbor. + +## Key Code Sections + +### Initial Document Parsing + +First, we see the creation and initialization of an HTML document. + +```c +document = parse((const lxb_char_t *) "", 0); +body = lxb_html_document_body_element(document); +``` + +The `parse` function initializes an empty HTML document. The subsequent +call to `lxb_html_document_body_element` retrieves the body element of +the document. + +### Initial HTML Tree Serialization + +To observe the initial state of the HTML document, the code serializes and +prints the document. + +```c +PRINT("Inital HTML Tree:"); +serialize(lxb_dom_interface_node(document)); +printf("\n"); +``` + +Here, the `serialize` function outputs the current structure of the document +tree, which is initially empty. + +### Creating and Inserting HTML Elements + +Next, the code iterates over all known HTML tag IDs and creates corresponding +elements. 
+ +```c +for (tag_id = LXB_TAG_A; tag_id < LXB_TAG__LAST_ENTRY; tag_id++) +{ + tag_name = lxb_tag_name_by_id(tag_id, &tag_name_len); + // Error handling omitted for brevity + + element = lxb_dom_document_create_element(&document->dom_document, + tag_name, tag_name_len, NULL); + // Error handling omitted for brevity + + if (lxb_html_tag_is_void(tag_id)) { + // Handling void elements + } + else { + text = lxb_dom_document_create_text_node(&document->dom_document, + tag_name, tag_name_len); + // Error handling omitted for brevity + + lxb_dom_node_insert_child(lxb_dom_interface_node(element), + lxb_dom_interface_node(text)); + } + serialize_node(lxb_dom_interface_node(element)); + lxb_dom_node_insert_child(lxb_dom_interface_node(body), + lxb_dom_interface_node(element)); +} +``` + +In this loop: + +1. `lxb_tag_name_by_id` retrieves the tag name associated with `tag_id`. +2. `lxb_dom_document_create_element` creates an element node for the tag. +3. If the tag is not a void element (based on the specification), a text node + with the tag name is created and appended as a child to the element. +4. `serialize_node` outputs the newly created element. +5. Finally, the element is appended to the body of the document. + +### Final HTML Tree Serialization + +After all elements are created and inserted into the document, the resulting +HTML structure is serialized and printed. + +```c +PRINT("\nTree after create elements:"); +serialize(lxb_dom_interface_node(document)); +``` + +This section provides a clear view of how the document looks after all +operations. + +### Document Cleanup + +Proper resource management is crucial. The example concludes by destroying +the document to free up memory. + +```c +lxb_html_document_destroy(document); +``` + +## Notes + +- **Document Initialization**: Creating an empty document and retrieving the body + element is fundamental for subsequent operations. +- **Element Creation**: Iterating through tag IDs systematically to create all + HTML elements showcases lexbor's comprehensive coverage of HTML tags. +- **Void Elements Handling**: Differentiation between void and non-void elements + is essential to comply with HTML specifications. +- **Serialization**: The serialization function is valuable for debugging and + inspecting the document structure. + +## Summary + +This example demonstrates the power and flexibility of the `lexbor` library for +HTML document manipulation. It covers essential operations such as parsing, +element creation, and serialization, and highlights best practices like resource +management and adherence to HTML specifications. Understanding this example is +crucial for anyone looking to effectively use lexbor for DOM manipulation tasks. \ No newline at end of file diff --git a/source/examples/html/element_innerHTML.md b/source/examples/html/element_innerHTML.md new file mode 100644 index 0000000..d1e243f --- /dev/null +++ b/source/examples/html/element_innerHTML.md @@ -0,0 +1,92 @@ +# Setting `innerHTML` Property in Lexbor + +This example in the file [lexbor/html/element_innerHTML.c](https://github.com/lexbor/lexbor/blob/master/examples/lexbor/html/element_innerHTML.c) demonstrates how to use the `lexbor` library to parse an HTML document, set the `innerHTML` of a body element, and serialize the resulting DOM tree. The intent of this code is to highlight key operations in manipulating the DOM using `lexbor`, such as document parsing, element selection, and updating the DOM tree. 
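
Stripped of printing and cleanup, the core of the example boils down to two calls: fetching the body element and handing it a new inner HTML string. The condensed sketch below shows that flow; it assumes an already parsed `document` (as produced by the `parse` helper) and uses a hypothetical replacement string rather than the one from the example:

```c
lxb_html_body_element_t *body;
lxb_html_element_t *element;

/* Hypothetical markup, not the string used by the example. */
static const lxb_char_t inner[] = "<p>replacement content</p>";

body = lxb_html_document_body_element(document);

element = lxb_html_element_inner_html_set(lxb_html_interface_element(body),
                                          inner, sizeof(inner) - 1);
if (element == NULL) {
    /* Parsing of the new inner HTML failed. */
}
```
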
+ +## Key Code Sections + +### Parsing HTML Document + +First, we start by parsing the initial HTML document. The `parse` function reads the HTML string and constructs the corresponding DOM tree. + +```c +static const lxb_char_t html[] = "
<div>blah-blah-blah</div>
"; +size_t html_len = sizeof(html) - 1; + +/* Parse */ +document = parse(html, html_len); +``` + +Here, `html` contains our initial HTML code. `html_len` determines the length of this string (excluding the null terminator). Then, the `parse` function returns a `document` representing our HTML document. + +### Printing the Parsed Document + +Next, the parsed HTML document is printed for verification. + +```c +PRINT("HTML:"); +PRINT("%s", (const char *) html); +PRINT("\nTree after parse:"); +serialize(lxb_dom_interface_node(document)); +``` + +This section outputs the original HTML string and the serialized DOM tree after parsing. The `serialize` function converts the DOM tree back to a string and prints it for inspection. + +### Obtaining the `body` Element + +After parsing, we obtain the `body` element from the document for further manipulation. + +```c +/* Get BODY element */ +body = lxb_html_document_body_element(document); +``` + +This retrieves the `body` element of the parsed document, which is required to set its `innerHTML`. + +### Setting Inner HTML + +We then set the `innerHTML` of the `body` element to a new HTML string. + +```c +static const lxb_char_t inner[] = "
<ul><li>1</li><li>2</li><li>3</li></ul>
"; +size_t inner_len = sizeof(inner) - 1; + +element = lxb_html_element_inner_html_set(lxb_html_interface_element(body), + inner, inner_len); +if (element == NULL) { + FAILED("Failed to parse innerHTML"); +} +``` + +Here, `inner` contains the new HTML to be set as the `innerHTML` of the `body` element. `inner_len` gives the length of this string. The `lxb_html_element_inner_html_set` function updates the `innerHTML` of the targeted element. An error is reported if the function fails. + +### Printing the Updated Document + +Finally, the modified DOM tree is serialized and printed. + +```c +PRINT("\nTree after innerHTML set:"); +serialize(lxb_dom_interface_node(document)); +``` + +This helps verify that the new `innerHTML` has been correctly applied to the `body` element. + +### Cleaning Up + +The last step is to clean up and free the allocated memory for the document. + +```c +/* Destroy all */ +lxb_html_document_destroy(document); +``` + +This ensures that all resources used by the document are properly released. + +## Notes + +- The `parse` function is expected to correctly handle the input HTML and generate a DOM tree. +- The function `lxb_html_element_inner_html_set` is used to set the `innerHTML` of an element and returns the modified element or `NULL` if an error occurs. +- Using `serialize` to print the DOM tree before and after modification is a good practice to verify changes made to the DOM. + +## Summary + +This example demonstrates the essential steps for manipulating an HTML document using the `lexbor` library: parsing the document, selecting elements, updating the `innerHTML`, and serializing the DOM tree. By following this process, developers can effectively manage the DOM structure of HTML documents using `lexbor`. \ No newline at end of file diff --git a/source/examples/html/elements_by_attr.md b/source/examples/html/elements_by_attr.md new file mode 100644 index 0000000..e4a2ab6 --- /dev/null +++ b/source/examples/html/elements_by_attr.md @@ -0,0 +1,113 @@ +# Extracting Elements by Attribute + +The file [lexbor/html/elements_by_attr.c](https://github.com/lexbor/lexbor/blob/master/examples/lexbor/html/elements_by_attr.c) demonstrates how to use the `lexbor` library to extract and manipulate HTML elements based on their attributes. This example illustrates a range of selection techniques, including full match, starts with, ends with, and contains. Here, we will provide an in-depth explanation of the key sections within this code to better understand its functionality. + +## Key Code Sections + +### Initialization and Parsing + +The example starts by initializing required variables and parsing the HTML document. + +```c +lxb_html_document_t *document; +const lxb_char_t html[] = "
" + "
" + "
" + "ref" + "
"; +size_t html_size = sizeof(html) - 1; + +document = parse(html, html_size); +body = lxb_dom_interface_element(document->body); +``` + +The `parse` function converts the raw HTML string into a structured `document` that `lexbor` can process. The `lxb_dom_interface_element` call retrieves the body element from the document for further manipulation. + +### Creating the Collection + +Next, the code creates a collection object to hold the selected elements. + +```c +collection = lxb_dom_collection_make(&document->dom_document, 128); +if (collection == NULL) { + FAILED("Failed to create Collection object"); +} +``` + +By calling `lxb_dom_collection_make`, a new collection is created with an initial capacity of 128 elements. This collection will be reused for different attribute selection methods. + +### Full Match Selection + +This section demonstrates how to select elements by an exact attribute match. + +```c +status = lxb_dom_elements_by_attr(body, collection, + (const lxb_char_t *) "class", 5, + (const lxb_char_t *) "red c++ best", 12, + true); +if (status != LXB_STATUS_OK) { + FAILED("Failed to get elements by name"); +} +PRINT("\nFull match by 'red c++ best':"); +print_collection_elements(collection); +``` + +The `lxb_dom_elements_by_attr` function is used here to find elements with the `class` attribute exactly matching "red c++ best." The result is stored in the `collection`. + +### Begin-Match Selection + +```c +status = lxb_dom_elements_by_attr_begin(body, collection, + (const lxb_char_t *) "href", 4, + (const lxb_char_t *) "http", 4, + true); +if (status != LXB_STATUS_OK) { + FAILED("Failed to get elements by name"); +} +PRINT("\nFrom begin by 'http':"); +print_collection_elements(collection); +``` + +In this snippet, `lxb_dom_elements_by_attr_begin` selects elements where the `href` attribute starts with "http". This demonstrates the flexibility of attribute-based selection. + +### End-Match Selection + +```c +status = lxb_dom_elements_by_attr_end(body, collection, + (const lxb_char_t *) "class", 5, + (const lxb_char_t *) "grep", 4, + true); +if (status != LXB_STATUS_OK) { + FAILED("Failed to get elements by name"); +} +PRINT("\nFrom end by 'grep':"); +print_collection_elements(collection); +``` + +The `lxb_dom_elements_by_attr_end` function selects elements where the `class` attribute ends with "grep." + +### Contain-Match Selection + +```c +status = lxb_dom_elements_by_attr_contain(body, collection, + (const lxb_char_t *) "class", 5, + (const lxb_char_t *) "c++ b", 5, + true); +if (status != LXB_STATUS_OK) { + FAILED("Failed to get elements by name"); +} +PRINT("\nContain by 'c++ b':"); +print_collection_elements(collection); +``` + +Lastly, `lxb_dom_elements_by_attr_contain` is used to find elements with `class` attributes containing "c++ b." + +## Notes + +- The `print_collection_elements` function efficiently serializes and prints the details of the selected elements. +- The collection is cleaned after each selection to prepare it for the next usage. +- Error handling ensures that failures in creating the collection or selecting elements are reported. + +## Summary + +This example showcases various techniques to select HTML elements by attributes using the `lexbor` library. By understanding how to utilize functions like `lxb_dom_elements_by_attr`, `lxb_dom_elements_by_attr_begin`, `lxb_dom_elements_by_attr_end`, and `lxb_dom_elements_by_attr_contain`, developers can effectively manipulate and query HTML documents based on specific attribute criteria. 
This is essential for tasks involving web scraping, data extraction, and document manipulation. \ No newline at end of file diff --git a/source/examples/html/elements_by_class_name.md b/source/examples/html/elements_by_class_name.md new file mode 100644 index 0000000..b818ae2 --- /dev/null +++ b/source/examples/html/elements_by_class_name.md @@ -0,0 +1,108 @@ +# Querying Elements by Class Name + +File: [lexbor/html/elements_by_class_name.c](https://github.com/lexbor/lexbor/blob/master/examples/lexbor/html/elements_by_class_name.c) + +This example demonstrates how to use the `lexbor` library to parse an HTML +document and retrieve all elements with a specific class name. The example +focuses on finding elements with the class name `"best"` from a given HTML +string and serializing them for output. + +## Key Code Sections + +### Parsing the HTML Document + +The first step involves parsing a hard-coded HTML string into a +`lxb_html_document_t` object that can be manipulated through the `lexbor` +library. + +```c +const lxb_char_t html[] = "
" + "
" + "
" + "
"; + +size_t html_szie = sizeof(html) - 1; + +document = parse(html, html_szie); +``` + +Here, the HTML string contains multiple `
` elements with different class +names. The `parse` function is used to convert this HTML string into a +`document` object, which can then be queried. + +### Creating a Collection + +To store the elements that match a specific query, a collection object is +created using `lxb_dom_collection_make`. + +```c +collection = lxb_dom_collection_make(&document->dom_document, 128); +if (collection == NULL) { + FAILED("Failed to create Collection object"); +} +``` + +The collection is initialized with a capacity of 128 elements, a reasonable +default size for various use cases. + +### Querying by Class Name + +The core functionality of this example is querying the parsed document by a +specific class name using `lxb_dom_elements_by_class_name`. + +```c +status = lxb_dom_elements_by_class_name(lxb_dom_interface_element(document->body), + collection, (const lxb_char_t *) "best", 4); +if (status != LXB_STATUS_OK) { + FAILED("Failed to get elements by name"); +} +``` + +Here, the function `lxb_dom_elements_by_class_name` is called with the root +element of the document's body, the collection to store results, the class name +`"best"` (as a `const lxb_char_t *`), and the length of the class name (which is +`4`). This function searches for all elements with the class name `"best"` and +stores them in the collection. + +### Serializing and Printing the Results + +Once the elements are found, they are iterated over and serialized for output. + +```c +for (size_t i = 0; i < lxb_dom_collection_length(collection); i++) { + element = lxb_dom_collection_element(collection, i); + serialize_node(lxb_dom_interface_node(element)); +} +``` + +Each element in the collection is retrieved using +`lxb_dom_collection_element` and passed to the `serialize_node` function, which +handles the process of serialization into a string format for printing. + +### Cleaning Up + +Finally, the collection and document are properly destroyed to free up memory. + +```c +lxb_dom_collection_destroy(collection, true); +lxb_html_document_destroy(document); +``` + +## Notes + +- **Memory Management**: Proper memory management is crucial. Ensure that all + created objects are destroyed to prevent memory leaks. +- **Error Handling**: Always check the return status of functions, especially + those that create objects or perform searches, to handle errors gracefully. +- **Collection Size**: The initial size of the collection can be adjusted based + on the expected number of elements to optimize performance. + +## Summary + +This example illustrates how to effectively use the `lexbor` library for +searching and manipulating elements in an HTML document. By understanding how to +parse the document, query elements by class name, and handle them appropriately, +you can leverage `lexbor` for various web scraping or HTML manipulation tasks. +The key takeaway is the efficient and accurate way `lexbor` allows querying and +handling elements based on class names, showcasing its robust capabilities for +document object model manipulation. 
\ No newline at end of file diff --git a/source/examples/html/elements_by_tag_name.md b/source/examples/html/elements_by_tag_name.md new file mode 100644 index 0000000..9b92288 --- /dev/null +++ b/source/examples/html/elements_by_tag_name.md @@ -0,0 +1,108 @@ +# Extracting Elements by Tag Name + +In this article, we will delve into the [lexbor/html/elements_by_tag_name.c](https://github.com/lexbor/lexbor/blob/master/examples/lexbor/html/elements_by_tag_name.c) example, +which demonstrates how to extract HTML elements by their tag name using the `lexbor` +library. This specific example focuses on parsing an HTML snippet and then retrieving +all `
` elements from it. We will analyze the different sections of the code to +understand how `lexbor` functions and data types facilitate these operations. + +## Key Code Sections + +### Parsing the HTML Document + +The first significant step in the code is parsing an HTML document using the given +HTML content. + +```c +const lxb_char_t html[] = "
"; +size_t html_szie = sizeof(html) - 1; + +document = parse(html, html_szie); +``` + +The `parse` function takes the HTML content and its size to convert the string into +an `lxb_html_document_t` structure. This document represents the parsed HTML in +memory, allowing further manipulations. + +### Creating and Initializing the Collection + +Next, we need a collection to store the elements we find. `lexbor` provides +mechanisms for creating and managing such collections efficiently. + +```c +collection = lxb_dom_collection_make(&document->dom_document, 128); +if (collection == NULL) { + FAILED("Failed to create Collection object"); +} +``` + +Here, `lxb_dom_collection_make` initializes a collection with a preallocated size of +128 elements. If the creation fails, it returns `NULL`, prompting the program to +exit with an error message. + +### Finding Elements by Tag Name + +The critical function `lxb_dom_elements_by_tag_name` performs the task of finding +all elements with a specific tag name. + +```c +status = lxb_dom_elements_by_tag_name(lxb_dom_interface_element(document->body), + collection, (const lxb_char_t *) "div", 3); +if (status != LXB_STATUS_OK) { + FAILED("Failed to get elements by name"); +} +``` + +In this code snippet: +- `lxb_dom_interface_element(document->body)` converts the body of the document + into a generic element interface. +- `collection` is passed to store the found elements. +- The tag name `"div"` is specified along with its length, `3`. + +If the function fails to find any elements, it returns a status other than +`LXB_STATUS_OK`. + +### Iterating Over and Serializing Found Elements + +Once the elements are found, we iterate over the collection and serialize each node +for display. + +```c +for (size_t i = 0; i < lxb_dom_collection_length(collection); i++) { + element = lxb_dom_collection_element(collection, i); + + serialize_node(lxb_dom_interface_node(element)); +} +``` + +We loop through each element in the collection, retrieve it using +`lxb_dom_collection_element`, and then serialize it for output using the +`serialize_node` function. + +### Cleanup + +Proper cleanup of allocated resources is crucial to avoid memory leaks. + +```c +lxb_dom_collection_destroy(collection, true); +lxb_html_document_destroy(document); +``` + +Here, `lxb_dom_collection_destroy` releases the memory for the collection, and +`lxb_html_document_destroy` does the same for the document. + +## Notes + +- The example underscores the importance of checking return values for error + handling. +- It showcases the use of `lxb_dom_elements_by_tag_name` to query elements + efficiently. + +## Summary + +The [lexbor/html/elements_by_tag_name.c](https://github.com/lexbor/lexbor/blob/master/examples/lexbor/html/elements_by_tag_name.c) example effectively demonstrates how to +parse an HTML document and extract elements by their tag name. Key takeaways include +the importance of proper initialization and error handling, as well as the +simplicity and power of the `lexbor` API for DOM manipulation tasks. This example is +an excellent starting point for developers looking to utilize the `lexbor` library +for web scraping or HTML processing tasks. 
\ No newline at end of file diff --git a/source/examples/html/encoding.md b/source/examples/html/encoding.md new file mode 100644 index 0000000..811978c --- /dev/null +++ b/source/examples/html/encoding.md @@ -0,0 +1,100 @@ +# Determining HTML Encoding + +The example code in [lexbor/html/encoding.c](https://github.com/lexbor/lexbor/blob/master/examples/lexbor/html/encoding.c) demonstrates how to determine the encoding of an HTML file using the `lexbor` library. This example is particularly useful for understanding how to initialize the encoding mechanism and extract the encoding information from the HTML content. + +In this example, the code performs several tasks to determine the HTML encoding. It initializes the HTML encoding detection system, reads the HTML file, and then identifies the encoding used in that file. This process is useful for web scraping, data extraction, and ensuring proper text rendering. The file in question is [lexbor/html/encoding.c](https://github.com/lexbor/lexbor/blob/master/examples/lexbor/html/encoding.c). + +## Key Code Sections + +### Main Function and Input Handling + +The program starts with the `main` function, which handles user input and delegates file reading and encoding detection. + +```c +int +main(int argc, const char *argv[]) +{ + size_t len; + lxb_char_t *html; + lxb_status_t status; + lxb_html_encoding_t em; + lxb_html_encoding_entry_t *entry; + + if (argc != 2) { + usage(); + exit(EXIT_SUCCESS); + } + + html = lexbor_fs_file_easy_read((lxb_char_t *) argv[1], &len); + if (html == NULL) { + FAILED(true, "Failed to read file: %s", argv[1]); + } + // ... rest of code ... +} +``` + +Here, the program expects a single argument: the path to the HTML file. It reads the file content using `lexbor_fs_file_easy_read`, which returns the file's content and length. + +### Encoding Initialization + +Next, the program initializes the encoding detection mechanism. + +```c +status = lxb_html_encoding_init(&em); +if (status != LXB_STATUS_OK) { + FAILED(false, "Failed to init html encoding"); +} +``` + +This part initializes the `lxb_html_encoding_t` structure. If initialization fails, the program exits with an error message. + +### Encoding Determination + +The core logic for determining the encoding follows. + +```c +status = lxb_html_encoding_determine(&em, html, (html + len)); +if (status != LXB_STATUS_OK) { + goto failed; +} + +entry = lxb_html_encoding_meta_entry(&em, 0); +if (entry != NULL) { + printf("%.*s\n", (int) (entry->end - entry->name), entry->name); +} +else { + printf("Encoding not found\n"); +} +``` + +The function `lxb_html_encoding_determine` scans the HTML content to find any encoding declarations. If an encoding is found, it retrieves the encoding entry using `lxb_html_encoding_meta_entry` and prints the encoding name. + +### Error Handling and Cleanup + +In case of errors, the program provides error messages and performs necessary cleanups. + +```c +lexbor_free(html); +lxb_html_encoding_destroy(&em, false); + +return 0; + +failed: + +lexbor_free(html); +lxb_html_encoding_destroy(&em, false); + +FAILED(false, "Failed to determine encoding"); +``` + +Here, `lexbor_free` releases the allocated memory for the HTML content, and `lxb_html_encoding_destroy` cleans up the encoding structure. + +## Notes + +- The example limits the bytes read to the first 1024 to save time, as encoding declarations are typically found early in the HTML. +- It uses `lexbor_fs_file_easy_read` for easy file reading, which abstracts away low-level file operations. 
+- Proper initialization and cleanup are crucial to avoid memory leaks. + +## Summary + +This example provides a clear, practical demonstration of how to determine the encoding of an HTML file using the `lexbor` library. It covers essential tasks such as initialization, reading file content, detecting encoding, and handling errors. Understanding this example is invaluable for developers needing to ensure correct text processing and rendering in various web-related applications. \ No newline at end of file diff --git a/source/examples/html/html2sexpr.md b/source/examples/html/html2sexpr.md new file mode 100644 index 0000000..3eb116d --- /dev/null +++ b/source/examples/html/html2sexpr.md @@ -0,0 +1,243 @@ +# Converting HTML Tag Tree to S-Expressions + +This article provides an in-depth explanation of the code from the file [lexbor/html/html2sexpr.c](https://github.com/lexbor/lexbor/blob/master/examples/lexbor/html/html2sexpr.c). The example demonstrates how to use the `lexbor` library to convert an HTML tag tree into an s-expression string, which is output to `stdout`. It covers the process of reading an HTML file, parsing it into a DOM tree, traversing the tree, and then serializing it into s-expressions. + +## Key Code Sections + +### Main Function Logic + +The `main` function initializes the HTML document, parses the input file, and invokes the traversal and serialization process. The core of the main function is structured as follows: + +```c +int +main(int argc, const char *argv[]) +{ + if (argc != 2) { + usage(); + FAILED("Invalid number of arguments"); + } + + lxb_status_t status; + lxb_html_document_t *document; + lxb_char_t *html; + size_t html_len; + + html = lexbor_fs_file_easy_read((const lxb_char_t *) argv[1], &html_len); + if (html == NULL) { + FAILED("Failed to read HTML file"); + } + + document = lxb_html_document_create(); + if (document == NULL) { + PRINT("Failed to create HTML Document"); + goto failed; + } + + status = lxb_html_document_parse(document, html, html_len); + if (status != LXB_STATUS_OK) { + PRINT("Failed to parse HTML"); + goto failed; + } + + status = tree_walker(lxb_dom_interface_node(document)->first_child, + serialize_cb, NULL); + if (status != LXB_STATUS_OK) { + PRINT("Failed to convert HTML to S-Expression"); + goto failed; + } + + lxb_html_document_destroy(document); + lexbor_free(html); + + return EXIT_SUCCESS; + +failed: + + lxb_html_document_destroy(document); + lexbor_free(html); + + return EXIT_FAILURE; +} +``` + +In this sequence, the key steps are: +1. **Reading the HTML File**: The `lexbor_fs_file_easy_read` function reads the HTML file and stores its content in `html`. +2. **Document Creation and Parsing**: The document is created using `lxb_html_document_create` and parsed with `lxb_html_document_parse`. +3. **Tree Traversal**: The `tree_walker` is called to traverse the HTML tree and serialize it. + +### Tree Walking and Serialization + +The `tree_walker` function recursively traverses the HTML DOM tree and calls the provided callback to serialize each node and its attributes into s-expressions. 
+ +```c +static lxb_status_t +tree_walker(lxb_dom_node_t *node, lxb_html_serialize_cb_f cb, void *ctx) +{ + lxb_status_t status; + lxb_dom_node_t *root = node->parent; + + const lxb_char_t *name; + size_t name_len = 0; + + while (node != NULL) { + if (node->type == LXB_DOM_NODE_TYPE_ELEMENT) { + status = cb((const lxb_char_t *) "(", 1, ctx); + if (status != LXB_STATUS_OK) { + return status; + } + + name = lxb_dom_element_qualified_name(lxb_dom_interface_element(node), + &name_len); + + status = cb(name, name_len, ctx); + if (status != LXB_STATUS_OK) { + return status; + } + + status = attributes(node, cb, ctx); + if (status != LXB_STATUS_OK) { + return status; + } + + if (node->local_name == LXB_TAG_TEMPLATE) { + lxb_html_template_element_t *temp = lxb_html_interface_template(node); + if (temp->content != NULL && temp->content->node.first_child != NULL) { + status = tree_walker(&temp->content->node, cb, ctx); + if (status != LXB_STATUS_OK) { + return status; + } + } + } + } + + if (node->first_child != NULL) { + node = node->first_child; + } + else { + // Closing tag + while (node != root && node->next == NULL) { + if (node->type == LXB_DOM_NODE_TYPE_ELEMENT) { + status = cb((const lxb_char_t *) ")", 1, ctx); + if (status != LXB_STATUS_OK) { + return status; + } + } + + node = node->parent; + } + + if (node->type == LXB_DOM_NODE_TYPE_ELEMENT) { + status = cb((const lxb_char_t *) ")", 1, ctx); + if (status != LXB_STATUS_OK) { + return status; + } + } + + if (node == root) { + break; + } + + node = node->next; + } + } + + return LXB_STATUS_OK; +} +``` + +This function: +1. **Starts the S-Expression Serialization**: Outputs a `(` followed by the element's name. +2. **Calls `attributes` Function**: Serializes each attribute into s-expressions. +3. **Recursively Processes Template Content**: Handles the special case of `