From 60600dd19388c9e5e66508a6a7d343129dfb3a43 Mon Sep 17 00:00:00 2001 From: Toxypi Date: Sat, 28 Sep 2024 07:30:41 +0100 Subject: [PATCH 1/9] Added examples. --- .../example-CSS-selectors-easy-way.md | 148 ------------------ source/articles/index.md | 11 +- source/examples/css/StyleSheet.md | 110 +++++++++++++ .../examples/css/selectors/list_easy_way.md | 74 +++++++++ .../examples/css/selectors/list_fast_way.md | 49 ++++++ source/examples/css/syntax/simple_colorize.md | 61 ++++++++ .../css/syntax/structure_parse_file.md | 49 ++++++ .../css/syntax/tokenizer/chunks_stdin.md | 117 ++++++++++++++ .../css/syntax/tokenizer/from_file.md | 132 ++++++++++++++++ .../css/syntax/tokenizer/print_raw.md | 104 ++++++++++++ .../examples/encoding/buffer/decode/decode.md | 45 ++++++ .../encoding/buffer/decode/decoder.md | 47 ++++++ .../encoding/buffer/decode/validate.md | 114 ++++++++++++++ .../examples/encoding/buffer/encode/encode.md | 102 ++++++++++++ .../encoding/buffer/encode/encoder.md | 133 ++++++++++++++++ .../encoding/buffer/encode/validate.md | 43 +++++ source/examples/encoding/buffer/from_to.md | 51 ++++++ source/examples/encoding/data_by_name.md | 66 ++++++++ .../examples/encoding/single/decode/decode.md | 122 +++++++++++++++ .../encoding/single/decode/decoder.md | 148 ++++++++++++++++++ .../encoding/single/decode/validate.md | 65 ++++++++ .../examples/encoding/single/encode/encode.md | 36 +++++ .../encoding/single/encode/encoder.md | 70 +++++++++ .../encoding/single/encode/validate.md | 46 ++++++ source/examples/encoding/single/from_to.md | 99 ++++++++++++ source/examples/html/document_parse.md | 90 +++++++++++ source/examples/html/document_parse_chunk.md | 92 +++++++++++ source/examples/html/document_title.md | 89 +++++++++++ source/examples/html/element_attributes.md | 120 ++++++++++++++ source/examples/html/element_create.md | 37 +++++ source/examples/html/element_innerHTML.md | 59 +++++++ source/examples/html/elements_by_attr.md | 131 ++++++++++++++++ 
.../examples/html/elements_by_class_name.md | 80 ++++++++++ source/examples/html/elements_by_tag_name.md | 115 ++++++++++++++ source/examples/html/encoding.md | 41 +++++ source/examples/html/html2sexpr.md | 89 +++++++++++ source/examples/html/parse.md | 29 ++++ source/examples/html/parse_chunk.md | 79 ++++++++++ source/examples/html/tokenizer/callback.md | 40 +++++ source/examples/html/tokenizer/simple.md | 37 +++++ .../examples/html/tokenizer/tag_attributes.md | 90 +++++++++++ source/examples/html/tokenizer/text.md | 91 +++++++++++ source/examples/index.md | 71 +++++++++ source/examples/punycode/decode.md | 81 ++++++++++ source/examples/punycode/encode.md | 93 +++++++++++ source/examples/selectors/easy_way.md | 37 +++++ source/examples/selectors/normal_way.md | 69 ++++++++ source/examples/selectors/unique_nodes.md | 41 +++++ source/examples/styles/attribute_style.md | 42 +++++ source/examples/styles/events_insert.md | 113 +++++++++++++ source/examples/styles/stylesheet.md | 135 ++++++++++++++++ source/examples/styles/walk.md | 52 ++++++ source/examples/unicode/idna_to_ascii.md | 97 ++++++++++++ source/examples/unicode/normalization_form.md | 54 +++++++ .../unicode/normalization_form_stdin.md | 29 ++++ source/examples/url/parse.md | 87 ++++++++++ source/examples/url/relative.md | 80 ++++++++++ source/index.md | 4 +- 58 files changed, 4276 insertions(+), 160 deletions(-) delete mode 100644 source/articles/example-CSS-selectors-easy-way.md create mode 100644 source/examples/css/StyleSheet.md create mode 100644 source/examples/css/selectors/list_easy_way.md create mode 100644 source/examples/css/selectors/list_fast_way.md create mode 100644 source/examples/css/syntax/simple_colorize.md create mode 100644 source/examples/css/syntax/structure_parse_file.md create mode 100644 source/examples/css/syntax/tokenizer/chunks_stdin.md create mode 100644 source/examples/css/syntax/tokenizer/from_file.md create mode 100644 source/examples/css/syntax/tokenizer/print_raw.md create 
mode 100644 source/examples/encoding/buffer/decode/decode.md create mode 100644 source/examples/encoding/buffer/decode/decoder.md create mode 100644 source/examples/encoding/buffer/decode/validate.md create mode 100644 source/examples/encoding/buffer/encode/encode.md create mode 100644 source/examples/encoding/buffer/encode/encoder.md create mode 100644 source/examples/encoding/buffer/encode/validate.md create mode 100644 source/examples/encoding/buffer/from_to.md create mode 100644 source/examples/encoding/data_by_name.md create mode 100644 source/examples/encoding/single/decode/decode.md create mode 100644 source/examples/encoding/single/decode/decoder.md create mode 100644 source/examples/encoding/single/decode/validate.md create mode 100644 source/examples/encoding/single/encode/encode.md create mode 100644 source/examples/encoding/single/encode/encoder.md create mode 100644 source/examples/encoding/single/encode/validate.md create mode 100644 source/examples/encoding/single/from_to.md create mode 100644 source/examples/html/document_parse.md create mode 100644 source/examples/html/document_parse_chunk.md create mode 100644 source/examples/html/document_title.md create mode 100644 source/examples/html/element_attributes.md create mode 100644 source/examples/html/element_create.md create mode 100644 source/examples/html/element_innerHTML.md create mode 100644 source/examples/html/elements_by_attr.md create mode 100644 source/examples/html/elements_by_class_name.md create mode 100644 source/examples/html/elements_by_tag_name.md create mode 100644 source/examples/html/encoding.md create mode 100644 source/examples/html/html2sexpr.md create mode 100644 source/examples/html/parse.md create mode 100644 source/examples/html/parse_chunk.md create mode 100644 source/examples/html/tokenizer/callback.md create mode 100644 source/examples/html/tokenizer/simple.md create mode 100644 source/examples/html/tokenizer/tag_attributes.md create mode 100644 
source/examples/html/tokenizer/text.md create mode 100644 source/examples/index.md create mode 100644 source/examples/punycode/decode.md create mode 100644 source/examples/punycode/encode.md create mode 100644 source/examples/selectors/easy_way.md create mode 100644 source/examples/selectors/normal_way.md create mode 100644 source/examples/selectors/unique_nodes.md create mode 100644 source/examples/styles/attribute_style.md create mode 100644 source/examples/styles/events_insert.md create mode 100644 source/examples/styles/stylesheet.md create mode 100644 source/examples/styles/walk.md create mode 100644 source/examples/unicode/idna_to_ascii.md create mode 100644 source/examples/unicode/normalization_form.md create mode 100644 source/examples/unicode/normalization_form_stdin.md create mode 100644 source/examples/url/parse.md create mode 100644 source/examples/url/relative.md diff --git a/source/articles/example-CSS-selectors-easy-way.md b/source/articles/example-CSS-selectors-easy-way.md deleted file mode 100644 index f9a3239..0000000 --- a/source/articles/example-CSS-selectors-easy-way.md +++ /dev/null @@ -1,148 +0,0 @@ -# Examples: CSS selectors, the easy way - -Let's start with an easy example of using `lexbor` for parsing and serializing -CSS selectors. This example breaks down the major steps and elements, explaining -the overall purpose, requirements, and assumptions at each step. - -The code for all examples is available in our [GitHub -repository](https://github.com/lexbor/lexbor/tree/master/examples/lexbor); this -specific example can be found at -[list_easy_way.c](https://github.com/lexbor/lexbor/blob/master/examples/lexbor/css/selectors/list_easy_way.c). - - -## Overall Purpose - -The example demonstrates how to use `lexbor` to parse a CSS selector string, -create a selector list, and then serialize the selector list. It also shows how -to handle parser logs and properly clean up allocated resources. 
- -This guide is designed to help you utilize `lexbor` for parsing and serializing -CSS selectors, with a focus on error handling and resource management. - -Please note that this is a basic (or *naive*) approach. A more advanced, -real-world example will be provided later. - - -## Major Steps and Elements - -### 1. Library Inclusion and Callback Function - -The code includes the necessary header files and defines a callback function -(`callback`) that prints the parsed data. - -```c -#include <lexbor/css/css.h> - -lxb_status_t callback(const lxb_char_t *data, size_t len, void *ctx) -{ - printf("%.*s", (int) len, (const char *) data); - return LXB_STATUS_OK; -} -``` - -### 2. Main Function - -The `main` function initializes the CSS parser, parses a CSS selector string, -and then serializes the resulting selector list. - -```c -int main(int argc, const char *argv[]) -{ - // ... (variable declarations) - - // Create parser. - parser = lxb_css_parser_create(); - status = lxb_css_parser_init(parser, NULL); - - // Check if parser initialization was successful. - if (status != LXB_STATUS_OK) { - return EXIT_FAILURE; - } - - // Parse and get the log. - // ... - - // Selector List Serialization. - // ... - - // Destroy resources for Parser. - // ... - - // Destroy all Selector List memory. - // ... - - return EXIT_SUCCESS; -} -``` - - -### 3. CSS Selector String and Parser Initialization - -The code defines a CSS selector string (`slctrs`) and initializes the CSS -parser. - -```c -static const lxb_char_t slctrs[] = ":has(div, :not(as, 1%, .class), #hash)"; - -parser = lxb_css_parser_create(); -status = lxb_css_parser_init(parser, NULL); -``` - - -### 4. Parsing CSS Selector and Handling Errors - -The code parses the CSS selector string, checks for parsing errors, and prints -the result. 
- -```c -list = lxb_css_selectors_parse(parser, slctrs, - sizeof(slctrs) / sizeof(lxb_char_t) - 1); - -if (parser->status != LXB_STATUS_OK) { - printf("Something went wrong\n"); - return EXIT_FAILURE; -} -``` - - -### 5. Selector List Serialization and Handling Logs - -The example serializes the parsed selector list and prints any parser logs. - -```c -printf("Result: "); -(void) lxb_css_selector_serialize_list(list, callback, NULL); -printf("\n"); - -// Check if there are any parser logs. -if (lxb_css_log_length(lxb_css_parser_log(parser)) != 0) { - printf("Log:\n"); - // Serialize parser logs with proper indentation. - (void) lxb_css_log_serialize(parser->log, callback, NULL, - indent, indent_length); - printf("\n"); -} -``` - - -### 6. Resource Cleanup - -Finally, the code destroys resources for the parser and frees memory allocated -for the selector list. - -```c -(void) lxb_css_parser_destroy(parser, true); -lxb_css_selector_list_destroy_memory(list); -``` - - -## Requirements and Assumptions - -Some key points to note: - -- The CSS selector string (`slctrs`) is predefined and used for parsing. -- It is assumed that parser initialization and selector list creation are - successful. -- Error handling is demonstrated by checking the parser's status, though it can - be further improved. -- The cleanup section ensures proper destruction of parser resources and memory. diff --git a/source/articles/index.md b/source/articles/index.md index 82f83e7..078dc8e 100644 --- a/source/articles/index.md +++ b/source/articles/index.md @@ -1,4 +1,4 @@ -# Articles, Examples +# Articles This series of articles discusses various aspects of `lexbor` implementation and design choices. @@ -8,12 +8,3 @@ This series of articles discusses various aspects of `lexbor` implementation and part* ``` - -These articles delve into some of the examples in more detail to help you use `lexbor` more effectively. 
- -```{toctree} -:maxdepth: 1 -:glob: - -example* -``` diff --git a/source/examples/css/StyleSheet.md b/source/examples/css/StyleSheet.md new file mode 100644 index 0000000..45e4b5f --- /dev/null +++ b/source/examples/css/StyleSheet.md @@ -0,0 +1,110 @@ +# CSS Stylesheet Parsing Example + +This article explains the example code within the file `lexbor/css/StyleSheet.c`, which demonstrates how to use the Lexbor library to read and parse a CSS stylesheet. The code showcases the steps required to initialize the parser, read the CSS data from a file, parse the stylesheet, and serialize the resulting object. + +## Code Breakdown + +### Includes and Function Declaration + +The code begins by including the necessary headers: `base.h` for foundational functionalities and `lexbor/core/fs.h` and `lexbor/css/css.h` for file system operations and CSS processing respectively. + +### Callback Function + +A callback function is defined that takes a pointer to character data, its length, and a context pointer as parameters: + +```c +lxb_status_t callback(const lxb_char_t *data, size_t len, void *ctx) { + printf("%.*s", (int) len, data); + return LXB_STATUS_OK; +} +``` + +This function will be used later to output the serialized CSS rules. It prints the data passed to it, formatted to handle the length of the string, ensuring that only the relevant part of the buffer is printed. + +### Main Function + +The `main` function initializes the program and takes one argument: the path to a CSS file. 
It begins by checking if the number of arguments is correct and printing usage instructions if not: + +```c +if (argc != 2) { + fprintf(stderr, "Usage:\n"); + fprintf(stderr, "\tStyleSheet <file>\n"); + FAILED("Invalid number of arguments"); +} +``` + +### Reading the CSS File + +Next, the code reads the contents of the specified CSS file into memory: + +```c +fl = (const lxb_char_t *) argv[1]; +css = lexbor_fs_file_easy_read(fl, &css_len); +if (css == NULL) { + FAILED("Failed to read CSS file"); +} +``` + +The `lexbor_fs_file_easy_read` function loads the file into the `css` buffer, and the length of the data is stored in `css_len`. If reading the file fails, an error message is displayed. + +### Parsing the CSS + +After successfully loading the CSS data, a CSS parser is created and initialized: + +```c +parser = lxb_css_parser_create(); +status = lxb_css_parser_init(parser, NULL); +if (status != LXB_STATUS_OK) { + FAILED("Failed to create CSS Parser"); +} +``` + +The parser initialization must succeed; otherwise, the program exits early with an error message. + +### StyleSheet Parsing + +The actual parsing occurs with the following line: + +```c +sst = lxb_css_stylesheet_parse(parser, css, css_len); +``` + +Here, `lxb_css_stylesheet_parse` processes the loaded CSS content and generates a stylesheet object, `sst`. If parsing fails, the program will exit. + +### Memory Management + +Following the parsing step, memory for the CSS buffer is freed, and the parser is destroyed: + +```c +(void) lexbor_free(css); +(void) lxb_css_parser_destroy(parser, true); +``` + +This cleanup is essential to avoid memory leaks in the application. 
+ +### Serializing the Output + +The code then serializes the stylesheet and outputs the rules using the previously defined callback: + +```c +status = lxb_css_rule_serialize(sst->root, callback, NULL); +if (status != LXB_STATUS_OK) { + FAILED("Failed to serialize StyleSheet"); +} +``` + +This process invokes the callback for each rule in the stylesheet, allowing for customizable output handling. + +### Final Cleanup + +Finally, the stylesheet object is destroyed to free up resources: + +```c +(void) lxb_css_stylesheet_destroy(sst, true); +``` + +The program concludes successfully by returning `EXIT_SUCCESS`. + +## Summary + +In this example, a CSS file is read, parsed, and its contents serialized using the Lexbor library. Each significant section of the code has been explained to provide clarity on the parsing process and resource management. By following these steps, developers can incorporate CSS parsing capabilities into their applications using Lexbor. \ No newline at end of file diff --git a/source/examples/css/selectors/list_easy_way.md b/source/examples/css/selectors/list_easy_way.md new file mode 100644 index 0000000..522c695 --- /dev/null +++ b/source/examples/css/selectors/list_easy_way.md @@ -0,0 +1,74 @@ +# CSS Selector Parsing Example + +This article provides an in-depth explanation of the code found in `list_easy_way.c`, which demonstrates how to use the lexbor library for parsing CSS selectors. The code illustrates the steps involved in initializing a parser, parsing a CSS selector string, and handling the results and logs. + +## Code Overview + +The example begins by including the necessary header file from the lexbor CSS library. The main purpose of this code is to showcase the parsing of a CSS selector string, specifically `:has(div, :not(as, 1%, .class), #hash)`, using the lexbor's CSS parser. 
+ +## Key Sections of the Code + +### Callback Function + +The `callback` function is defined to handle output during the serialization process of the CSS selector list. It takes three parameters: a character pointer to the data, the length of that data, and a context pointer. Inside the function, the data is printed to the standard output using `printf`, formatted to respect the length provided. + +```c +lxb_status_t callback(const lxb_char_t *data, size_t len, void *ctx) { + printf("%.*s", (int) len, (const char *) data); + return LXB_STATUS_OK; +} +``` + +### Main Function + +The `main` function begins by declaring variables for the parser and the selector list. It initializes the necessary constants for indentation used in log formatting and specifies the CSS selector string to be parsed. + +#### Parser Initialization + +A parser is created with `lxb_css_parser_create()`, and its initialization is performed with `lxb_css_parser_init()`. The code checks the return status of the initialization and exits gracefully if there is an issue, preventing further execution with an invalid parser instance. + +```c +parser = lxb_css_parser_create(); +status = lxb_css_parser_init(parser, NULL); +if (status != LXB_STATUS_OK) { + return EXIT_FAILURE; +} +``` + +#### Parsing the Selector + +The parsing of the CSS selector occurs with the function `lxb_css_selectors_parse()`, which takes the parser, the selector string, and its length as arguments. The status of the parser is checked afterward to ensure that the parsing was successful. + +```c +list = lxb_css_selectors_parse(parser, slctrs, + sizeof(slctrs) / sizeof(lxb_char_t) - 1); +if (parser->status != LXB_STATUS_OK) { + printf("Something went wrong\n"); + return EXIT_FAILURE; +} +``` + +#### Selector List Serialization + +The parsed selector list is then serialized using `lxb_css_selector_serialize_list()`, which invokes the previously defined `callback` function. 
This outputs the result of the serialization to standard output. + +```c +(void) lxb_css_selector_serialize_list(list, callback, NULL); +``` + +### Handling Logs + +If there are any logs generated during parsing, they are checked with `lxb_css_log_length()`, and the log is serialized in a similar manner, making use of the callback function and proper indentation for the displayed log. + +### Cleanup + +Finally, the example demonstrates proper resource management by destroying the parser and the associated memory. This is crucial in C programming to prevent memory leaks. The parser is destroyed first, followed by the cleanup of the selector list's memory. + +```c +(void) lxb_css_parser_destroy(parser, true); +lxb_css_selector_list_destroy_memory(list); +``` + +## Conclusion + +This example effectively showcases the functionality of the lexbor CSS library for parsing CSS selectors. From initializing the parser to handling logs and cleaning up memory, each step is crucial for ensuring that the program runs efficiently and correctly. The structured approach presented in the code promotes good practices in C programming, particularly regarding memory management and error handling. \ No newline at end of file diff --git a/source/examples/css/selectors/list_fast_way.md b/source/examples/css/selectors/list_fast_way.md new file mode 100644 index 0000000..f15671a --- /dev/null +++ b/source/examples/css/selectors/list_fast_way.md @@ -0,0 +1,49 @@ +# CSS Selectors Parsing Example + +This article explains the functionality present in the `list_fast_way.c` source file of the lexbor CSS library, illustrating how to parse CSS selectors effectively. The primary goal of the code is to demonstrate the parsing of various CSS selectors and report the results, including any parsing warnings that may arise. + +## Code Structure Overview + +The entire program is structured around a single function `main`, which is the entry point when the program is executed. 
Several components of the code are critical to understanding how it prepares for and executes CSS selector parsing. + +### Including Required Headers + +The program begins by including `lexbor/css/css.h`, which is essential as it provides the functions, types, and structures required for working with the lexbor CSS parser. + +### Callback Function + +The `callback` function is defined to handle logging messages that arise during the CSS parsing process. It takes in data and its length, printing the message using `printf`. This function is a straightforward implementation that merely outputs the parsed messages but can be extended for more complex handlers if needed. + +### Main Function Logic + +Inside the `main` function, the following key operations are performed: + +1. **Memory Setup:** + - A memory object is created using `lxb_css_memory_create`, which acts as a buffer for all parsed structures. + - Initialization of memory is conducted with `lxb_css_memory_init`, setting aside an initial block of 128 bytes. + +2. **Parser Initialization:** + - A CSS parser object is instantiated with `lxb_css_parser_create` and initialized using `lxb_css_parser_init`. + +3. **Binding the Memory and Selectors:** + - It is crucial to bind the created memory object to the parser to prevent memory allocation issues during parsing. This is achieved using `lxb_css_parser_memory_set`. + - Similarly, a selectors object is created and initialized. This object must also be bound to the parser, so its data can be managed correctly while parsing CSS selectors. + +### Parsing CSS Selectors + +The program defines a static array of CSS selectors to be parsed. Each selector is processed in a loop, where: + +- The parser attempts to parse each selector using `lxb_css_selectors_parse`. +- The output is assessed to determine if parsing was successful or if there were any warnings or errors. Any issues are logged using the `callback` function, which provides informative feedback. 
+ +### Resource Cleanup + +After all parsing operations, the program destroys the selectors and parser resources by calling `lxb_css_selectors_destroy` and `lxb_css_parser_destroy`. This step is crucial in managing memory and avoiding leaks in longer-running applications. + +### Serialization of Results + +Finally, the parsed selector lists are serialized and printed. For each selector, the program checks whether parsing produced a selector list and, if so, serializes it with the `lxb_css_selector_serialize_list` function. If a selector results in an empty list, it is noted accordingly. + +### Conclusion + +The `list_fast_way.c` example serves as a practical guide for developers looking to understand how to parse CSS selectors using the lexbor library. By emphasizing memory management, proper initialization, and error handling, this example lays a solid foundation for further applications of the library in real-world projects. The code harnesses the flexibility of lexbor while maintaining clarity and efficiency in parsing operations, making it an invaluable resource for CSS-related development. \ No newline at end of file diff --git a/source/examples/css/syntax/simple_colorize.md b/source/examples/css/syntax/simple_colorize.md new file mode 100644 index 0000000..f086ee7 --- /dev/null +++ b/source/examples/css/syntax/simple_colorize.md @@ -0,0 +1,61 @@ +# CSS Syntax Parsing Example + +This article provides an explanation of a code example from the source file `lexbor/css/syntax/simple_colorize.c`. The code implements a simple CSS parser that reads a CSS file, parses its content, and provides color-coded output for each type of CSS rule and declaration using ANSI escape codes. + +## Structure of the Program + +The main function serves as the entry point of the program, where the user is expected to provide a CSS file as an argument. The program then reads this file into memory, initializes a CSS parser, and calls a function to parse the CSS content. + +### Key Components + +1. 
**Initialization and File Handling**: + - The program checks for the correct number of command-line arguments. + - It leverages the `lexbor_fs_file_easy_read` function to read the CSS content from the specified file into a buffer. + +2. **CSS Parser Setup**: + - It creates an instance of a CSS parser using `lxb_css_parser_create`. + - The parser is then initialized with `lxb_css_parser_init`. + +3. **CSS Parsing Function**: + - The function `css_parse` is called, which sets up the parsing context and starts the rule parsing process. + +4. **Token Handling**: + - Several callback functions are defined to handle the various types of CSS syntax tokens, including qualified rules, at-rules, and declaration blocks. + +## Detailed Code Explanation + +### CSS Parsing Function (`css_parse`) + +The `css_parse` function initializes a context structure `css_ctx_t`, which tracks the current offset within the CSS data while parsing. It sets the parsing buffer using `lxb_css_parser_buffer_set` and begins the rule parsing using `lxb_css_syntax_parser_list_rules_push`. + +The call to `lxb_css_syntax_parser_run` runs the parser, which processes the CSS tokens based on the rules specified. This function returns a status that indicates whether the parsing succeeded or failed. + +### Token Callbacks + +The program defines various inline functions and callbacks to handle the output of tokens during parsing: + +- **`css_print_token`** and **`css_print_token_offset`**: These functions print a CSS token along with proper formatting. They utilize ANSI escape codes to change text color in the console output for better visualization. + +### Rule Handling + +The parser is equipped with callbacks for handling different CSS rules: + +- **`css_list_rules_state`**: This function handles the state of list rules and is responsible for printing the state with a specific color. 
+ +- **`css_at_rule_state`** and **`css_at_rule_block`**: These handle at-rules and their blocks, printing the corresponding tokens and managing the nested structure of CSS. + +- **`css_qualified_rule_state`** and **`css_qualified_rule_block`**: Manage the parsing of qualified rules and their associated declaration blocks, printing relevant information while maintaining contextual awareness of the current location within the CSS input. + +### Declarations Handling + +The parsing of declarations involves several parts: + +- **`css_declarations_name`** and **`css_declarations_value`**: Handle the CSS property names and values, respectively, printing them in different colors to distinguish visually between different parts of declarations. + +### Memory Management + +The code releases the CSS data buffer and the parser instance by calling `lexbor_free` and `lxb_css_parser_destroy`, respectively, which prevents memory leaks. + +## Conclusion + +This example illustrates how to implement a simple CSS parser that reads a file, processes its content into structured tokens, and outputs the result with visual cues. The use of callback functions and context structures allows for flexible and extendable parsing logic, suitable for more complex scenarios in CSS syntax processing. \ No newline at end of file diff --git a/source/examples/css/syntax/structure_parse_file.md b/source/examples/css/syntax/structure_parse_file.md new file mode 100644 index 0000000..6ada7e9 --- /dev/null +++ b/source/examples/css/syntax/structure_parse_file.md @@ -0,0 +1,49 @@ +# CSS Syntax Parser Example + +This article provides an overview of the code located in `lexbor/css/syntax/structure_parse_file.c`, which implements a CSS syntax parser using the lexbor library. The primary goal of this code is to parse CSS syntax rules and declarations, handling various states and transitions within the parsing process. 
+ +## Code Overview + +The code starts with the inclusion of headers that bring in necessary definitions and functions from the lexbor library. It defines multiple functions and callback structures that manage the parsing of different CSS constructs. Central to the code is the `main` function, which serves as the entry point of the application. + +### Main Function + +The `main` function performs several key operations: + +1. **Argument Validation**: It checks if the number of command-line arguments is correct. If not, it prints usage instructions and exits the program. + +2. **File Reading**: It reads a CSS file specified by the user and stores its contents into a variable `css`. If this reading fails, the program exits with an error message. + +3. **Parser Initialization**: It creates and initializes a CSS parser instance. If the initialization fails, the program reports an error and exits. + +4. **Parsing Execution**: The `css_parse` function is called with the parser and the CSS data to carry out the parsing process. + +5. **Cleanup**: After the parsing is done, it releases allocated resources and exits with success or failure status based on the parsing outcome. + +### CSS Parsing Implementation + +The `css_parse` function is crucial as it sets up the parsing buffer and pushes the initial parsing rules onto a stack. Here’s a breakdown of its functionality: + +- **Set Buffer**: The parsing buffer of the parser is set with the provided CSS data and its length. + +- **Push Rules**: The function uses the `lxb_css_syntax_parser_list_rules_push` to initiate the parsing of list rules, which is a fundamental construct in CSS. It expects a pointer to a set of callback functions that manage how the list of rules is processed. + +- **Run Parser**: Finally, it triggers the parsing process with `lxb_css_syntax_parser_run`, which advances through the tokens available in the CSS data. 
+ +### Callback Functions + +The code defines a series of callback functions that manage specific CSS rules, states, and declarations: + +- **State Management**: Functions like `css_list_rules_state`, `css_at_rule_state`, and `css_declarations_name` handle specific parser states. Each of these functions typically logs the current processing step and processes tokens of interest. They return a success status after handling the tokens. + +- **Handling Blocks**: Functions such as `css_at_rule_block` and `css_qualified_rule_block` manage blocks of CSS rules, utilizing the `css_consule_tokens` function to process tokens within those blocks. These functions also handle stack manipulations depending on the rule context, such as pushing or popping a stack. + +- **End States**: Functions like `css_list_rules_end` and `css_declarations_end` signal the completion of various sections. These may log end messages or perform any necessary cleanup. + +### Additional Utility Functions + +The utility function `css_consule_tokens` is noteworthy. It iterates through tokens and processes each one sequentially, calling `lxb_css_syntax_token_serialize` to serialize each token's data. This function also handles token consumption, facilitating smooth progress through the parsing state. + +### Conclusion + +The code contained in `structure_parse_file.c` offers a comprehensive implementation of a CSS syntax parser with well-defined states and callbacks. The use of systematic error handling and resource management provides stability to the parsing process. By integrating these components, the lexbor library enhances its ability to interpret and manipulate CSS effectively. 
\ No newline at end of file diff --git a/source/examples/css/syntax/tokenizer/chunks_stdin.md b/source/examples/css/syntax/tokenizer/chunks_stdin.md new file mode 100644 index 0000000..a387c75 --- /dev/null +++ b/source/examples/css/syntax/tokenizer/chunks_stdin.md @@ -0,0 +1,117 @@ +# CSS Syntax Tokenizer Example + +This article explains the implementation of a CSS syntax tokenizer in the file `lexbor/css/syntax/tokenizer/chunks_stdin.c`. The code demonstrates how to read CSS data from standard input, tokenize it, and output the identified token types along with their serialized representations. + +## Overview + +The main purpose of this example is to showcase the mechanics of the `lxb_css_syntax_tokenizer`, a component provided by the Lexbor library for parsing CSS syntax. The example leverages standard input (stdin) to read CSS input, processes the tokens through the tokenizer, and outputs details about each token to the console. + +## Code Breakdown + +### Includes and Definitions + +At the beginning of the file, necessary headers are included, such as `lexbor/css/css.h`, which contains the definitions and interfaces for the CSS parser. A small buffer size of 32 bytes is defined with `#define BUFFER_SIZE 32`, which limits the amount of data read from stdin at one time, making it suitable for demonstration purposes. + +### Callback Function + +The `callback` function is defined to handle the serialized output of the tokens: + +```c +lxb_status_t callback(const lxb_char_t *data, size_t len, void *ctx) { + printf("%s", (const char *) data); + return LXB_STATUS_OK; +} +``` + +This function prints the serialized token data to the console and returns a status indicating success. It serves as a simple mechanism to display token information during parsing. 
+ +### Chunk Callback Function + +The `chunk_cb` function reads chunks of CSS data into a buffer and sets up the tokenizer to consume these chunks: + +```c +lxb_status_t chunk_cb(lxb_css_syntax_tokenizer_t *tkz, const lxb_char_t **data, const lxb_char_t **end, void *ctx) { + size_t size; + lxb_char_t *buff = ctx; + + size = fread((char *) buff, 1, BUFFER_SIZE, stdin); + if (size != BUFFER_SIZE) { + if (feof(stdin)) { + tkz->eof = true; + } else { + return EXIT_FAILURE; + } + } + + *data = buff; + *end = buff + size; + + return LXB_STATUS_OK; +} +``` + +The function first attempts to read a buffer full of CSS data from stdin. If the end of input is reached, it marks the tokenizer's end-of-file (EOF) state. If an error occurs during reading, it returns a failure status. The function effectively prepares the data for the tokenizer by updating the pointed `data` and `end` pointers. + +### Main Function + +The `main` function orchestrates the initialization and the execution of the CSS syntax tokenizer: + +```c +int main(int argc, const char *argv[]) { + lxb_status_t status; + lxb_css_syntax_token_t *token; + lxb_css_syntax_tokenizer_t *tkz; + lxb_css_syntax_token_type_t type; + const lxb_char_t *name; + char inbuf[BUFFER_SIZE]; + + tkz = lxb_css_syntax_tokenizer_create(); + status = lxb_css_syntax_tokenizer_init(tkz); + if (status != LXB_STATUS_OK) { + PRINT("Failed to create CSS:Syntax parser"); + goto failed; + } + + lxb_css_syntax_tokenizer_chunk_cb_set(tkz, chunk_cb, inbuf); +``` + +This section starts by creating and initializing a tokenizer instance. If initialization fails, it gracefully exits the process. Notably, it sets the chunk callback function, associating it with the previously defined `chunk_cb` and the input buffer `inbuf`. 
+ +#### Token Processing Loop + +The main loop processes tokens until the EOF is reached: + +```c +do { + token = lxb_css_syntax_token(tkz); + if (token == NULL) { + PRINT("Failed to parse CSS"); + goto failed; + } + + name = lxb_css_syntax_token_type_name_by_id(token->type); + printf("%s: ", (const char *) name); + + lxb_css_syntax_token_serialize(token, callback, NULL); + printf("\n"); + + type = lxb_css_syntax_token_type(token); + lxb_css_syntax_token_consume(tkz); +} while (type != LXB_CSS_SYNTAX_TOKEN__EOF); +``` + +In this loop, it retrieves the next token from the tokenizer and checks for parsing errors. If a token is successfully obtained, it retrieves and prints the token's type name, serializes the token using the earlier defined `callback`, and then consumes the token to prepare for the next cycle. This loop continues until an EOF token is encountered. + +### Cleanup + +At the end of the function, the tokenizer is destroyed to free up allocated resources: + +```c +lxb_css_syntax_tokenizer_destroy(tkz); +``` + +If any failures occur at various stages, the code ensures proper cleanup to avoid memory leaks. + +## Conclusion + +This example illustrates how to implement a simple CSS syntax tokenizer using the Lexbor library, allowing for parsing CSS input from stdin and outputting token information. Anyone looking to understand or extend CSS parsing functionality can use this code as a foundation for further development. \ No newline at end of file diff --git a/source/examples/css/syntax/tokenizer/from_file.md b/source/examples/css/syntax/tokenizer/from_file.md new file mode 100644 index 0000000..dbd336a --- /dev/null +++ b/source/examples/css/syntax/tokenizer/from_file.md @@ -0,0 +1,132 @@ +# CSS Syntax Tokenizer Example + +This article provides a detailed explanation of a CSS syntax tokenizer implemented in the file `lexbor/css/syntax/tokenizer/from_file.c`. 
The code serves the purpose of reading a CSS file, processing its contents to extract tokens, and producing output that describes each token. + +## Overview + +The main function of the tokenizer is to parse CSS code from a file, generate tokens for syntactic analysis, and then invoke a callback function to handle the output of each token. The program efficiently handles input and organizes the parsing process with the help of the lexbor library. + +## Code Breakdown + +### Includes and Utility Functions + +At the beginning of the file, necessary libraries are included: + +```c +#include <lexbor/css/css.h> +#include <lexbor/core/fs.h> +``` + +The first include provides access to CSS-related functionality within the lexbor library, whereas the second includes core file system operations needed to read the CSS file. + +A utility function `usage` is defined to provide a simple usage instruction: + +```c +static void usage(void) +{ + fprintf(stderr, "parse_file <file>\n"); +} +``` + +This function prints a usage hint to `stderr` when the user does not provide the correct number of arguments. + +### Main Function Logic + +The entry point of the program is the `main` function, which processes command-line arguments and orchestrates the tokenization process: + +```c +int main(int argc, const char *argv[]) +``` + +#### Argument Validation + +At the start of the main function, the program checks whether exactly one command-line argument (the CSS file name) has been provided: + +```c +if (argc != 2) { + usage(); + FAILED("Invalid number of arguments"); +} +``` + +If not, it calls the `usage` function and exits with an error. + +#### File Reading + +Next, the code attempts to read the specified CSS file: + +```c +css = lexbor_fs_file_easy_read((const lxb_char_t *) argv[1], &css_len); +if (css == NULL) { + FAILED("Failed to read CSS file"); +} +``` + +The `lexbor_fs_file_easy_read` function reads the entire file into memory, and if it fails, the program reports the error and exits. 
+ +#### Tokenizer Initialization + +The tokenizer is created and initialized: + +```c +tkz = lxb_css_syntax_tokenizer_create(); +status = lxb_css_syntax_tokenizer_init(tkz); +``` + +These lines allocate memory for the tokenizer and perform any necessary setup. If initialization fails, an error message is printed, and the program proceeds to cleanup. + +#### Setting Input Buffer + +Next, the contents of the CSS file are set as the input buffer for the tokenizer: + +```c +lxb_css_syntax_tokenizer_buffer_set(tkz, css, css_len); +``` + +This prepares the tokenizer to begin processing the CSS data. + +### Tokenization Loop + +The program enters a loop to process the tokens extracted from the input buffer: + +```c +do { + token = lxb_css_syntax_token(tkz); + if (token == NULL) { + PRINT("Failed to parse CSS"); + goto failed; + } + + name = lxb_css_syntax_token_type_name_by_id(token->type); + printf("%s: ", (const char *) name); + + lxb_css_syntax_token_serialize(token, callback, NULL); + printf("\n"); + + type = lxb_css_syntax_token_type(token); + + lxb_css_syntax_token_consume(tkz); +} +while (type != LXB_CSS_SYNTAX_TOKEN__EOF); +``` + +#### Token Extraction + +Within the loop, the function `lxb_css_syntax_token` retrieves a token. If no token is available, it reports a parsing failure. Upon successful token retrieval, it prints the type name of the token followed by calling `lxb_css_syntax_token_serialize`, which uses the provided `callback` function to output the token data. + +The type of the current token is acquired to determine if the end of the file (EOF) has been reached. If the EOF is not reached, the loop continues to consume tokens. + +### Cleanup and Exit + +When the entire CSS file has been processed, resources are cleaned up: + +```c +lxb_css_syntax_tokenizer_destroy(tkz); +lexbor_free(css); +``` + +Finally, the program returns `EXIT_SUCCESS` if the execution was successful, or `EXIT_FAILURE` in case of any errors during the process. 
+ +## Conclusion + +The CSS syntax tokenizer effectively reads and parses a CSS file, extracting and displaying token details by utilizing the lexbor library's API for CSS processing. This example demonstrates not only the functionality of lexer-based parsing but also highlights memory management and error handling within a complex system. \ No newline at end of file diff --git a/source/examples/css/syntax/tokenizer/print_raw.md b/source/examples/css/syntax/tokenizer/print_raw.md new file mode 100644 index 0000000..22c15df --- /dev/null +++ b/source/examples/css/syntax/tokenizer/print_raw.md @@ -0,0 +1,104 @@ +# CSS Syntax Tokenizer Example + +This article provides an overview of the `print_raw.c` source file, which implements a simple command-line tool for tokenizing CSS syntax using the Lexbor library. The primary purpose of this code is to read a CSS file, tokenize its contents, and print the tokens to the standard output. + +## Breakdown of Major Code Sections + +### Usage Function + +The `usage` function is defined to inform users about how to execute the program properly. It outputs a simple message stating that the tool requires one argument, which is the name of the file to process: + +```c +static void +usage(void) +{ + fprintf(stderr, "print_raw <file>\n"); +} +``` + +This function is called when the number of command line arguments (`argc`) provided is incorrect. It helps to guide users in using the tool correctly. + +### Main Function Logic + +The `main` function serves as the entry point of the program. It starts by checking if the user has provided exactly one argument: + +```c +if (argc != 2) { + usage(); + FAILED("Invalid number of arguments"); +} +``` + +If this condition is not met, the `usage` function is invoked to display the correct usage. The `FAILED` macro indicates an error state, although its definition is not shown in this excerpt. + +### Reading the CSS File + +The next step involves reading the CSS file specified by the user. 
The function `lexbor_fs_file_easy_read` attempts to read the file into memory: + +```c +css = lexbor_fs_file_easy_read((const lxb_char_t *) argv[1], &css_len); +if (css == NULL) { + FAILED("Failed to read CSS file"); +} +``` + +If the reading process fails, the program terminates by invoking the `FAILED` macro once again to report the issue. + +### Tokenization Process + +The tokenization process begins with the creation of a tokenizer instance: + +```c +tkz = lxb_css_syntax_tokenizer_create(); +status = lxb_css_syntax_tokenizer_init(tkz); +``` + +After creating the tokenizer, it is initialized with the `lxb_css_syntax_tokenizer_init` function. If the initialization does not succeed, an error message is printed, and the program enters the cleanup phase. + +The following block of code sets the tokenizer's buffer to contain the CSS content read from the file: + +```c +tkz->with_comment = true; + +lxb_css_syntax_tokenizer_buffer_set(tkz, css, css_len); +``` + +The `with_comment` flag indicates whether comments should be included in the tokenization process. + +### Processing Tokens + +The main loop of the `main` function processes the tokens generated by the tokenizer: + +```c +do { + token = lxb_css_syntax_token(tkz); + if (token == NULL) { + PRINT("Failed to parse CSS"); + goto failed; + } + + colorize_cb(token); + + type = lxb_css_syntax_token_type(token); + + lxb_css_syntax_token_consume(tkz); +} +while (type != LXB_CSS_SYNTAX_TOKEN__EOF); +``` + +Within this loop, a token is fetched, and if it cannot be retrieved, an error message is printed. The `colorize_cb` function is called to handle the output for each token. After processing the token, its type is checked, and it is consumed for the next iteration. 
+ +### Cleanup Phase + +After all tokens have been processed, the program cleans up by destroying the tokenizer instance and freeing any allocated memory: + +```c +lxb_css_syntax_tokenizer_destroy(tkz); +lexbor_free(css); +``` + +Finally, if no errors occurred during processing, the program returns `EXIT_SUCCESS`. In case of failure, it follows a similar cleanup procedure but returns `EXIT_FAILURE`. + +## Conclusion + +The `print_raw.c` implementation demonstrates how to leverage the Lexbor library for CSS syntax tokenization. By following a structured approach, it effectively reads CSS content, processes it into tokens, and provides robust error handling. This example serves as a foundation for further exploration of CSS parsing and analysis using Lexbor. \ No newline at end of file diff --git a/source/examples/encoding/buffer/decode/decode.md b/source/examples/encoding/buffer/decode/decode.md new file mode 100644 index 0000000..d127104 --- /dev/null +++ b/source/examples/encoding/buffer/decode/decode.md @@ -0,0 +1,45 @@ +# UTF-8 Decoding Example + +In this article, we will explore a code example from the file `lexbor/encoding/buffer/decode/decode.c` that demonstrates how to decode a UTF-8 encoded string into code points using the Lexbor library. This example specifically highlights the usage of Lexbor's encoding functionalities, providing insights into how to leverage these features for character decoding in C. + +## Code Explanation + +The code begins by including the necessary header files. It specifically includes `lexbor/encoding/encoding.h`, which contains the declarations needed for encoding and decoding operations. The definition of the `FAILED` macro is also provided, which facilitates error handling by printing an error message to `stderr` and terminating the program if an error occurs. + +### Main Function + +The `main` function serves as the entry point of our program, where we will set up the decoding of a UTF-8 encoded string. 
+ +#### Variable Declarations + +Within the `main` function, several important variables are declared: + +- `buf_length`: To store the length of the decoded buffer. +- `status`: To hold the status of operations, indicated by the `lxb_status_t` type. +- `cp`: An array of `lxb_codepoint_t` to hold the decoded code points. +- `decode`: An instance of `lxb_encoding_decode_t`, which manages the decoding process. +- `encoding`: A pointer to the encoding data. + +Next, we prepare the buffer that contains the UTF-8 string "Привет, мир!" (which translates to "Hello, World!"). The buffer is defined as `data`, and `end` is set to point to the end of the string using `strlen`. + +#### Initialization + +The initialization process is crucial for setting up the decoder. We call `lxb_encoding_data(LXB_ENCODING_UTF_8)` to get the encoding data for UTF-8. Then, we initialize the decoder using `lxb_encoding_decode_init`, passing the decoder instance, encoding, the code point array, and its capacity. + +If this initialization fails, the `FAILED` macro is triggered, notifying us with an error message and stopping the program. + +#### Decoding Process + +After successful initialization, we print the original UTF-8 string to the console. The actual decoding is carried out by calling the `decode` function through the `encoding` pointer. The function decodes the string pointed to by `data` up to its `end`, storing the results in the `cp` array. + +In this context, an error during decoding is not expected. Therefore, the code contains a comment indicating that such a situation cannot occur in this example, underlining the robustness of the decoding function for the given input. + +#### Output and Conclusion + +Finally, we calculate the length of the buffer used in the decoding process with `lxb_encoding_decode_buf_used(&decode)` and print each decoded code point in hexadecimal format. + +The program concludes with a return statement indicating successful execution. 
+ +## Summary + +This example effectively illustrates how to decode a UTF-8 string into individual code points using the Lexbor library. It emphasizes the initialization of the decoding context, error handling strategies, and the process of translating encoded UTF-8 data into usable character representations. Through careful management of buffers and decoding functions, developers can build robust applications that accurately handle multi-byte character sets. \ No newline at end of file diff --git a/source/examples/encoding/buffer/decode/decoder.md b/source/examples/encoding/buffer/decode/decoder.md new file mode 100644 index 0000000..2a57f61 --- /dev/null +++ b/source/examples/encoding/buffer/decode/decoder.md @@ -0,0 +1,47 @@ +# Unicode Decoder Example + +In this article, we will discuss a simple Unicode decoder implemented in C, specifically within the context of the lexbor library. The code can be found in the source file `lexbor/encoding/buffer/decode/decoder.c`. This program is designed to take a specified character encoding from the command line, read input data, and decode it into Unicode code points, displaying the result in a format suitable for further processing or representation. + +## Code Structure Overview + +The code begins with the necessary includes, defines, and utility functions required for the decoder's operation. Key components include error handling, usage instructions, and the main decoding loop. + +### Error Handling Macro + +The `FAILED` macro is defined to streamline error reporting throughout the code. It takes a boolean indicating if usage should be displayed, followed by a formatted message. If an error occurs, this macro outputs the error message to standard error and, if requested, invokes the `usage()` function to display acceptable encoding options. + +### Usage Function + +The `usage` function is a simple utility that displays how the program should be invoked and lists the character encodings that the decoder supports. 
This function becomes crucial when the user fails to provide the expected arguments. + +### Main Function Logic + +The `main` function serves as the entry point of the application. It handles argument parsing, encoding determination, and the initialization of the decoding process. + +#### Argument Parsing + +The program checks if exactly one argument (the encoding name) has been provided. If not, it calls the `usage()` function and exits gracefully. + +#### Encoding Retrieval + +Next, it uses the `lxb_encoding_data_by_pre_name` function to retrieve the encoding data based on the provided encoding name. If the encoding cannot be determined, the `FAILED` macro is invoked with appropriate error handling. + +#### Decoder Initialization + +Once the encoding is acquired, the decoder is initialized using `lxb_encoding_decode_init`. It also sets up a buffer for any replacement characters that may need to be utilized during the decoding process. Each initialization step includes error checking to ensure the decoder is prepared for processing the input data. + +### Decoding Loop + +The main decoding operation occurs within a loop that reads data from standard input. The program continuously reads chunks of data into a buffer (`inbuf`) until the end of the input is reached. + +#### Buffer Processing + +For each chunk of data read, the program decodes the input using the encoding's decode function. It iterates over the decoded results, determining whether each code point is an ASCII character or a Unicode character. The output format uses a hexadecimal representation for both types of characters, with Unicode points prefixed by `\u` and ASCII points by `\x`. + +#### Finalizing Decoding + +After all input data has been processed, the decoder's `finish` function is called. This function ensures that any remaining code points, particularly those that could not be fully processed, are correctly handled. The remaining code points are then printed if any exist in the output buffer. 
+ +## Conclusion + +This `decoder.c` example illustrates the practical use of the lexbor library for handling various character encodings and converting them into a clear, usable form. By leveraging the available utility functions and error handling methods, the code provides a robust framework for decoding inputs in a specified encoding, making it valuable for any application that requires processing text in diverse formats. \ No newline at end of file diff --git a/source/examples/encoding/buffer/decode/validate.md b/source/examples/encoding/buffer/decode/validate.md new file mode 100644 index 0000000..78826be --- /dev/null +++ b/source/examples/encoding/buffer/decode/validate.md @@ -0,0 +1,114 @@ +# UTF-8 Decoding and Replacement Example + +This article will explain a C code example that demonstrates UTF-8 decoding and the handling of invalid byte sequences using the lexbor library. The source file for the example is `lexbor/encoding/buffer/decode/validate.c`. + +## Overview + +The provided code illustrates how to initialize a decoder for UTF-8 encoded strings and replace any invalid byte sequences with specified replacement code points. This is accomplished utilizing the lexbor encoding API. + +## Code Breakdown + +### Including Necessary Headers + +At the start of the code, the relevant header file from the lexbor library is included: + +```c +#include <lexbor/encoding/encoding.h> +``` + +This inclusion is necessary as it provides the required declarations and definitions for encoding operations performed later in the code. + +### Defining a Macro for Error Handling + +A macro named `FAILED` is defined to handle errors gracefully: + +```c +#define FAILED(...) \ + do { \ + fprintf(stderr, __VA_ARGS__); \ + fprintf(stderr, "\n"); \ + \ + exit(EXIT_FAILURE); \ + } \ + while (0) +``` + +This macro uses `fprintf` to print error messages to standard error and then exits the program with `EXIT_FAILURE`. It helps streamline error reporting throughout the code. 
+ +### Main Function and Buffer Preparation + +The main function initializes several variables, including a buffer for decoded code points and an instance of the decoder: + +```c +int main(int argc, const char *argv[]) { + size_t buf_length; + lxb_status_t status; + lxb_codepoint_t cp[32]; + lxb_encoding_decode_t decode; + const lxb_encoding_data_t *encoding; + + const lxb_char_t *data = (const lxb_char_t *) "Привет,\x80 мир!"; + const lxb_char_t *end = data + strlen((char *) data); +``` + +In this segment, a buffer `cp` is defined to hold up to 32 decoded code points. The `data` variable contains a UTF-8 string that includes an invalid byte (`\x80`). The `end` variable calculates the pointer to the end of the `data`. + +### Initializing the Decoder + +The code initializes the decoder for UTF-8 using: + +```c +encoding = lxb_encoding_data(LXB_ENCODING_UTF_8); +status = lxb_encoding_decode_init(&decode, encoding, cp, + sizeof(cp) / sizeof(lxb_codepoint_t)); +if (status != LXB_STATUS_OK) { + FAILED("Failed to initialization decoder"); +} +``` + +Here, `lxb_encoding_data` retrieves the encoding data for UTF-8. The `lxb_encoding_decode_init` function sets up the decoder with the encoding information and the previously defined buffer for decoded code points. If initialization fails, the `FAILED` macro is invoked. + +### Configuring Replacement Settings + +Next, the code sets up settings for replacing invalid byte sequences: + +```c +status = lxb_encoding_decode_replace_set(&decode, LXB_ENCODING_REPLACEMENT_BUFFER, + LXB_ENCODING_REPLACEMENT_BUFFER_LEN); +if (status != LXB_STATUS_OK) { + FAILED("Failed to set replacement code points for decoder"); +} +``` + +This step allows the decoder to specify how to handle invalid sequences by using the replacement character defined in lexbor. Again, the error handling is consistent throughout. 
+ +### Decoding the Input String + +The actual decoding is performed with the following: + +```c +status = encoding->decode(&decode, &data, end); +if (status != LXB_STATUS_OK) { + /* In this example, this cannot happen. */ +} +``` + +This line invokes the decoding process, moving through the input string from `data` to `end`. The decoder attempts to handle any valid sequences and replaces any invalid sequences as configured earlier. + +### Outputting the Decoded Values + +Finally, the decoded code points are printed: + +```c +buf_length = lxb_encoding_decode_buf_used(&decode); + +for (size_t i = 0; i < buf_length; i++) { + printf("0x%04X\n", cp[i]); +} +``` + +Here, `lxb_encoding_decode_buf_used` retrieves the number of valid code points decoded. Then, a loop iterates over each code point in the buffer, printing the hexadecimal representation. + +## Conclusion + +This example effectively showcases the use of the lexbor library for decoding UTF-8 strings while managing potentially invalid byte sequences. By initializing the decoder, setting up replacement strategies, and decoding the input string, the program demonstrates a robust method for handling encoding issues in C. \ No newline at end of file diff --git a/source/examples/encoding/buffer/encode/encode.md b/source/examples/encoding/buffer/encode/encode.md new file mode 100644 index 0000000..50d58b5 --- /dev/null +++ b/source/examples/encoding/buffer/encode/encode.md @@ -0,0 +1,102 @@ +# Encoding Unicode Code Points to UTF-8 Example + +This article explains the encoding of Unicode code points to a UTF-8 byte string using the Lexbor library. The source code is located in `lexbor/encoding/buffer/encode/encode.c`. This example demonstrates how to initialize the encoder, encode Unicode code points, and handle the output appropriately. + +## Overview + +The primary purpose of this code is to convert an array of Unicode code points into a UTF-8 encoded string. 
The code includes error handling, memory allocation for the output buffer, and final output printing. + +## Code Explanation + +### Includes and Macros + +The code begins with the inclusion of the `lexbor/encoding/encoding.h` header file, which provides necessary functions and definitions for encoding operations. A macro called `FAILED` is defined to handle error reporting: + +```c +#define FAILED(...) \ + do { \ + fprintf(stderr, __VA_ARGS__); \ + fprintf(stderr, "\n"); \ + \ + exit(EXIT_FAILURE); \ + } \ + while (0) +``` + +This macro simplifies the error handling by printing an error message to `stderr` and exiting the program if there is a failure during initialization. + +### Main Function + +The `main` function initializes several variables and prepares to encode the Unicode code points: + +```c +int main(int argc, const char *argv[]) +{ + lxb_status_t status; + lxb_encoding_encode_t encode; + const lxb_codepoint_t *cps_ref, *cps_end; + const lxb_encoding_data_t *encoding; + + /* Prepare buffer */ + lxb_char_t buffer[1024]; +``` + +In this section, a buffer of 1024 characters is created to hold the encoded byte string. The `lxb_codepoint_t` array contains several predefined Unicode code points. + +### Unicode Code Points + +The code points initialized in the `cps` array represent Cyrillic characters and symbols: + +```c +lxb_codepoint_t cps[] = {0x041F, 0x0440, 0x0438, 0x0432, 0x0435, 0x0442, + 0x002C, 0x0020, 0x043C, 0x0438, 0x0440, 0x0021}; +``` + +### Encoder Initialization + +The encoding data for UTF-8 is retrieved and the encoder is initialized with: + +```c +encoding = lxb_encoding_data(LXB_ENCODING_UTF_8); +status = lxb_encoding_encode_init(&encode, encoding, buffer, sizeof(buffer)); +if (status != LXB_STATUS_OK) { + FAILED("Failed to initialization encoder"); +} +``` + +Here, `lxb_encoding_data` retrieves encoding information for UTF-8, and `lxb_encoding_encode_init` initializes the encoding context. 
If the initialization fails, the `FAILED` macro is invoked. + +### Encoding Process + +Next, the code encodes the Unicode code points: + +```c +status = encoding->encode(&encode, &cps_ref, cps_end); +if (status != LXB_STATUS_OK) { + /* In this example, this cannot happen. */ +} +``` + +This line calls the `encode` function from the `encoding` structure, which encodes the code points from `cps_ref` to `cps_end`. + +### Output Preparation + +After encoding, the buffer is terminated with a null character: + +```c +buffer[ lxb_encoding_encode_buf_used(&encode) ] = 0x00; +``` + +### Printing Results + +Finally, the result is displayed: + +```c +printf("\nResult: %s\n", (char *) buffer); +``` + +This prints the encoded UTF-8 string to standard output along with the original Unicode values shown in hexadecimal format. + +## Conclusion + +This code example effectively demonstrates the usage of the Lexbor encoding library for converting Unicode code points to a UTF-8 encoded string. It emphasizes proper initialization, error handling, and output formatting, which are essential for working with character encoding in C programming. \ No newline at end of file diff --git a/source/examples/encoding/buffer/encode/encoder.md b/source/examples/encoding/buffer/encode/encoder.md new file mode 100644 index 0000000..a0db86d --- /dev/null +++ b/source/examples/encoding/buffer/encode/encoder.md @@ -0,0 +1,133 @@ +# Encoder Example + +This article provides an explanation of the `encoder.c` source file located in the `lexbor/encoding/buffer/encode` directory. The intent of the code is to implement a command-line utility that encodes input data based on the specified character encoding name. The encoder processes Standard Input, converts it based on escape sequences into code points, and outputs the encoded data to Standard Output. 
+ +## Code Structure and Major Sections + +### Header and Includes + +At the beginning of the file, there are several include statements that bring in necessary libraries: + +```c +#include +#include +#include +#include +``` + +These headers allow access to string manipulation functions, standard input/output functionalities, and the defined encoding structures and functions within the `lexbor` library. + +### Error Handling + +The `FAILED` macro is defined to streamline error handling within the code. It prints an error message and usage instructions when an issue occurs: + +```c +#define FAILED(with_usage, ...) \ + do { \ + fprintf(stderr, __VA_ARGS__); \ + fprintf(stderr, "\n"); \ + \ + if (with_usage) { \ + usage(); \ + } \ + \ + exit(EXIT_FAILURE); \ + } \ + while (0) +``` + +This macro takes a boolean flag to determine if usage instructions should be displayed before exiting. This ensures that any critical failures can inform users about incorrect command usage. + +### Usage Function + +The `usage` function provides a simple guide on how to run the encoder, listing available encodings. It helps users understand the valid options to include when calling the program: + +```c +static void usage(void) +{ + printf("Usage: encoder \n\n"); + printf("Available encodings:\n"); + // List of encodings... +} +``` + +### Main Function + +The `main` function is the core of the program, where execution begins. It handles command-line arguments, initializes encoding setups, reads from Standard Input, and writes the encoded data to Standard Output. + +#### Command-Line Argument Handling + +The program expects one argument - the encoding name. If this is not provided, the `usage` function is invoked: + +```c +if (argc != 2) { + usage(); + exit(EXIT_SUCCESS); +} +``` + +#### Encoding Initialization + +The encoding is determined using the `lxb_encoding_data_by_pre_name` function, which fetches the encoding data associated with the provided name. 
If it fails, it reports an error: + +```c +encoding = lxb_encoding_data_by_pre_name((const lxb_char_t *) argv[1], strlen(argv[1])); +if (encoding == NULL) { + FAILED(true, "Failed to get encoding from name: %s\n", argv[1]); +} +``` + +After determining the encoding, the encoder is initialized with `lxb_encoding_encode_init`: + +```c +status = lxb_encoding_encode_init(&encode, encoding, outbuf, sizeof(outbuf)); +if (status != LXB_STATUS_OK) { + FAILED(false, "Failed to initialization encoder"); +} +``` + +This sets up a buffer for output based on the specified encoding type. + +### Data Encoding Loop + +The heart of the encoding process is found in a `do-while` loop that reads from stdin and encodes the input data: + +```c +do { + read_size = fread(inbuf, 1, sizeof(inbuf), stdin); + // Encoding logic... +} while (loop); +``` + +If the end of the file is reached on standard input (`feof(stdin)`), the loop breaks, indicating that no more data is available. + +#### Escaped Code Points Conversion + +The `escaped_to_codepoint` function handles the conversion of escape sequences (e.g., '\x41' for 'A') into code points that can be processed. The logic checks for valid escape sequences and builds the code points accordingly. If a broken sequence is detected, it triggers an error: + +```c +static const lxb_codepoint_t * escaped_to_codepoint(const lxb_char_t *data, ... +if (*state != 0) { + // Handle escape sequence state... + // Process each character to build the codepoint... +} +``` + +### Finalizing and Outputting + +After encoding, the program finalizes the encoded output and writes any remaining data to stdout. This is done using: + +```c +read_size = lxb_encoding_encode_buf_used(&encode); +if (read_size != 0) { + if (fwrite(outbuf, 1, read_size, stdout) != read_size) { + FAILED(false, "Failed to write data to stdout"); + } +} +``` + +This ensures that any data that has not yet been flushed from the buffer is written out before the program exits. 
+ +## Conclusion + +The `encoder.c` file is a functional implementation of an encoding utility using the lexbor library. It handles various character encodings, processes input data in a loop, and writes the encoded result to standard output, making it a practical tool for developers working with different text encodings. Its built-in error handling and usage guidance further enhance its usability in command-line environments. \ No newline at end of file diff --git a/source/examples/encoding/buffer/encode/validate.md b/source/examples/encoding/buffer/encode/validate.md new file mode 100644 index 0000000..90b73d6 --- /dev/null +++ b/source/examples/encoding/buffer/encode/validate.md @@ -0,0 +1,43 @@ +# Unicode Encoding Example + +This article explains the functionality of a Unicode encoding example, which can be found in the source file `lexbor/encoding/buffer/encode/validate.c`. The code serves as an illustration of how to encode Unicode code points into a UTF-8 byte string using the Lexbor library. + +## Overview + +The example demonstrates the process of setting up an encoder, preparing a buffer for the encoded result, and ultimately encoding a series of Unicode code points. The code also highlights error handling when initializing the encoder and configuring it with replacement bytes for invalid code points. + +## Code Explanation + +### Includes and Macros + +The code begins by including necessary header files, specifically `string.h` for string manipulation and `lexbor/encoding/encoding.h` for encoding functions from the Lexbor library. A macro named `FAILED` is defined for error handling, which simplifies reporting errors by outputting a message to `stderr` and exiting the program with a failure status. + +### Main Function + +The `main` function encapsulates the entire encoding process.
It starts by declaring variables that will be used later, including an `lxb_encoding_encode_t` structure to handle the encoding state, pointers to a list of code points, and a buffer initialized to hold the resulting UTF-8 byte string. + +### Code Points Preparation + +A set of Unicode code points is prepared in an array called `cps`, which includes valid points such as Cyrillic characters, a comma, a space, and an exclamation mark. Notably, one of the code points included is `0x110000`, which is invalid. This serves to demonstrate how replacement strategies can be applied when dealing with unexpected values. + +### Encoder Initialization + +The code subsequently retrieves the encoding data for UTF-8 using the `lxb_encoding_data` function. The encoder is initialized with `lxb_encoding_encode_init`, which requires the encoder structure, encoding data, a buffer, and the size of that buffer. If initialization fails, the program uses the `FAILED` macro to report the error and terminate. + +### Setting Replacement Bytes + +After successful initialization, the example configures the encoder to use specific replacement bytes for invalid code points by invoking `lxb_encoding_encode_replace_set`. This ensures that when an invalid code point is encountered during the encoding process, a predetermined sequence of bytes will replace it. + +### Encoding Process + +A message is printed to indicate the start of the encoding process. The actual encoding is performed using the `encode` function pointer from the encoding data, which takes the encoder structure and a range defined by pointers to the beginning and end of the code points. + +If the encoding state indicates an error, it will be silently ignored here since it should not occur in this example. After encoding, the buffer is appropriately terminated with a null byte to signify the end of the string. 
+ +### Output + +Finally, the code loops through the original code points, printing each as a hexadecimal value to the console. It then outputs the resulting UTF-8 string stored in the buffer, demonstrating the successful encoding of the input code points. + +## Conclusion + +This example showcases how to utilize the Lexbor library to encode Unicode code points into a UTF-8 byte string while implementing error handling and customization through replacement bytes for invalid code points. By following the steps outlined, developers can efficiently manage Unicode data in their applications. \ No newline at end of file diff --git a/source/examples/encoding/buffer/from_to.md b/source/examples/encoding/buffer/from_to.md new file mode 100644 index 0000000..248dc8c --- /dev/null +++ b/source/examples/encoding/buffer/from_to.md @@ -0,0 +1,51 @@ +# Encoding Conversion Example + +This article describes an example of encoding conversion using the `from_to` program from the `lexbor` library, specifically found in the source file `lexbor/encoding/buffer/from_to.c`. The program reads data from the standard input, converts the data from one encoding to another (specified by the user), and outputs the result to the standard output. + +## Overview + +The main function of the program is to facilitate the conversion of text between various character encodings. This operation is critical in environments where data needs to be interpreted correctly across different platforms or applications that utilize specific character encoding schemes. The program checks the validity of input encodings, performs the decode and encode operations, and handles errors appropriately. + +### Major Components + +1. **Macro Definition for Error Handling** + A macro named `FAILED` is defined to centralize error handling within the program. It takes a flag (`with_usage`) to determine if usage instructions should be displayed, outputs an error message to `stderr`, and exits the program. 
This reduces redundancy in error handling and improves code maintainability. + + ```c + #define FAILED(with_usage, ...) \ + ``` + +2. **Usage Function** + The `usage` function prints out how to use the program along with available encoding names. If the required number of arguments is not provided (specifically two arguments for 'from' and 'to'), this function will be invoked to guide the user. + + ```c + static void usage(void) {...} + ``` + +3. **Main Function Logic** + The `main` function is where the primary execution occurs. It begins by checking command-line arguments to ensure the user has provided the necessary inputs. The program uses `lxb_encoding_data_by_pre_name` to retrieve encoding information based on user input, and if either input is invalid, it calls the `FAILED` macro. + +4. **Initialization of Encoder and Decoder** + Both the encoder and decoder are initialized with their respective encoding data. The decoder will convert input bytes into code points (abstract character representations), while the encoder converts these code points back into byte sequences of the target encoding. + + ```c + status = lxb_encoding_decode_init(&decode, from, cp, sizeof(cp) / sizeof(lxb_codepoint_t)); + ``` + +5. **Processing Input Data** + The program reads data from `stdin` in a loop until all input is processed. The decode operation converts the input byte sequence into code points, which are then passed to the encoder to convert into the target encoding. The `fwrite` function is employed to write the output to `stdout`. + + ```c + size = fread(inbuf, 1, sizeof(inbuf), stdin); + ``` + +6. **Finalization** + After all input has been processed, the program ensures that any remaining decoded data is encoded and written to the output. Special care is taken for the `iso-2022-jp` encoding, which may require specific handling to finalize the conversion. 
+ + ```c + (void) lxb_encoding_encode_finish(&encode); + ``` + +## Conclusion + +The `from_to` example illustrates how to adeptly handle encoding conversions in C using the lexbor library. By providing a structured way to manage different encodings and offering clear error handling, this example serves as a foundational component in the development of applications that require text data manipulation across various encodings. The modular approach allows enhancements to be easily integrated, such as supporting additional encodings or modifying the input/output methods. \ No newline at end of file diff --git a/source/examples/encoding/data_by_name.md b/source/examples/encoding/data_by_name.md new file mode 100644 index 0000000..c89d3a4 --- /dev/null +++ b/source/examples/encoding/data_by_name.md @@ -0,0 +1,66 @@ +# Encoding Data Retrieval Example + +This article provides an explanation of an example from the file `lexbor/encoding/data_by_name.c`. The purpose of this code is to demonstrate how to retrieve encoding data by its name using the Lexbor encoding library. The code illustrated here highlights the procedure for accessing character encoding information, specifically focusing on UTF-8. + +## Code Explanation + +The program starts with the necessary `#include` directive, which includes the Lexbor encoding library header file. This library provides the functionality needed to work with different character encodings. + +### Main Function + +The `main` function serves as the entry point of the program: + +```c +int main(int argc, const char *argv[]) +``` + +Here, it accepts two parameters: the argument count `argc` and an array of argument strings `argv`. Although the parameters are not utilized in this example, they are typically included for potential command-line functionality. 
+ +### Retrieving Encoding Data + +The key operation occurs in the following block: + +```c +const lxb_encoding_data_t *enc_data; +enc_data = lxb_encoding_data_by_name((lxb_char_t *) "uTf-8", 5); +``` + +In this segment, the variable `enc_data` is declared as a pointer to `lxb_encoding_data_t`, which represents the encoding data structure in Lexbor. The function `lxb_encoding_data_by_name` is called with two arguments: the string "uTf-8" (with a deliberate mixed case) and the length of the string, which is `5`. + +This function attempts to retrieve encoding data corresponding to the specified name. If the name provided does not match any available encoding in the library, the function will return `NULL`. + +### Error Handling + +The next block of code provides basic error handling: + +```c +if (enc_data == NULL) { + return EXIT_FAILURE; +} +``` + +If `enc_data` is `NULL`, the program terminates with a failure status. This is an important check to ensure that the encoding has been found before attempting to access any of its properties, thus preventing potential runtime errors. + +### Output Encoding Name + +Upon successful retrieval of the encoding data, the program proceeds to print the name of the encoding: + +```c +printf("%s\n", enc_data->name); +``` + +This line outputs the name of the encoding that has been retrieved, which in this case would be "UTF-8", assuming the spelling was correct in the function call. + +### Exit Status + +Finally, the program completes its execution successfully: + +```c +return EXIT_SUCCESS; +``` + +This line returns a success status to the operating system, indicating that the program has run without any issues. + +## Conclusion + +The example presented in `lexbor/encoding/data_by_name.c` effectively demonstrates how to access encoding data using the Lexbor encoding library. 
It showcases the importance of error handling and provides a simple way to retrieve and display the name of a character encoding, using UTF-8 as a practical example. This code can serve as a foundational component for applications that require encoding information for text processing. \ No newline at end of file diff --git a/source/examples/encoding/single/decode/decode.md b/source/examples/encoding/single/decode/decode.md new file mode 100644 index 0000000..294ec3e --- /dev/null +++ b/source/examples/encoding/single/decode/decode.md @@ -0,0 +1,122 @@ +# UTF-8 Decoding Example + +This article explains a code example from `lexbor/encoding/single/decode/decode.c`, which demonstrates how to decode a UTF-8 string into its respective code points using the lexbor library. + +## Introduction + +The primary purpose of this code is to decode a UTF-8 encoded string, specifically the phrase "Привет, мир!" (which means "Hello, world!" in Russian), into individual Unicode code points. It showcases the initialization of the decoder, the processing of the input string, and outputting the results in a formatted manner. + +## Code Explanation + +### Include the Required Header + +The necessary header file is included at the beginning of the code: + +```c +#include <lexbor/encoding/encoding.h> +``` + +This header provides the necessary declarations for working with encoding functionalities offered by lexbor. + +### Error Handling Macro + +The code defines a macro for error handling: + +```c +#define FAILED(...) \ + do { \ + fprintf(stderr, __VA_ARGS__); \ + fprintf(stderr, "\n"); \ + exit(EXIT_FAILURE); \ + } \ + while (0) +``` + +This macro outputs an error message to the standard error stream and exits the program if a failure condition is met. It streamlines error handling throughout the code. + +### Main Function + +The `main` function serves as the entry point of the program: + +```c +int main(int argc, const char *argv[]) +{ + ...
+} +``` + +### Variable Declarations + +Several variables are declared to handle the decoding process, including: + +- `lxb_codepoint_t cp;`: Stores the current code point. +- `lxb_status_t status;`: Holds the status of operations. +- `lxb_encoding_decode_t decode;`: The decoder instance. +- `const lxb_encoding_data_t *encoding;`: Pointer to the encoding data. +- `const lxb_char_t *pos;`: Pointer to track the current position in the input data. + +### Preparing the Input Buffer + +The input UTF-8 string is initialized, along with a pointer to the end of the string: + +```c +const lxb_char_t *data = (const lxb_char_t *) "Привет, мир!"; +const lxb_char_t *end = data + strlen((char *) data); +``` + +The `strlen` function determines the length of the string to establish the end of the data. + +### Setting Up the Encoding + +The program retrieves UTF-8 encoding data with: + +```c +encoding = lxb_encoding_data(LXB_ENCODING_UTF_8); +``` + +This function sets up the necessary encoding data for subsequent decoding operations. + +### Initializing the Decoder + +The decoder is initialized with: + +```c +status = lxb_encoding_decode_init_single(&decode, encoding); +if (status != LXB_STATUS_OK) { + FAILED("Failed to init decoder"); +} +``` + +If the initialization fails, the program invokes the `FAILED` macro to print the error and exit. + +### Decoding Loop + +Following initialization, the program enters a loop to decode each character in the input string: + +```c +while (data < end) { + ... +} +``` + +Inside the loop, the current position (`pos`) is recorded, and the decoding function is called: + +```c +cp = encoding->decode_single(&decode, &data, end); +``` + +This line decodes a single UTF-8 character, advancing the input pointer `data` as needed. The result is checked against a maximum allowable code point value, although in this example, that condition is expected never to occur. 
+ +### Outputting the Results + +For each decoded character, the code prints the results to the standard output: + +```c +printf("%.*s: 0x%04X\n", (int) (data - pos), pos, cp); +``` + +This formatted output provides both the original UTF-8 character (as a substring) and its corresponding Unicode code point in hexadecimal format. + +## Conclusion + +The example demonstrates a straightforward approach to decoding a UTF-8 string into Unicode code points using the lexbor library. It effectively showcases initialization, error handling, and character decoding, providing a practical illustration of working with character encodings in C. \ No newline at end of file diff --git a/source/examples/encoding/single/decode/decoder.md b/source/examples/encoding/single/decode/decoder.md new file mode 100644 index 0000000..7a039ed --- /dev/null +++ b/source/examples/encoding/single/decode/decoder.md @@ -0,0 +1,148 @@ +# Encoding Decoder Example + +In this article, we will explore the encoding decoder example found in the file `lexbor/encoding/single/decode/decoder.c`. This code demonstrates how to decode input data from standard input according to a specified character encoding. It provides a useful utility for developers needing to handle various text encodings in their applications. + +## Code Overview + +The main function of this code is to read data from standard input, decode it according to the specified encoding, and print the corresponding Unicode values. It uses the Lexbor library to facilitate this process. + +### Header and Includes + +At the beginning of the file, we find the licensing information and the inclusion of the Lexbor encoding header: + +```c +#include <lexbor/encoding/encoding.h> +``` + +This inclusion allows access to functions and definitions related to text encoding and decoding. + +### Error Handling Macro + +A macro named `FAILED` is defined to streamline error management: + +```c +#define FAILED(with_usage, ...)
\ + do { \ + fprintf(stderr, __VA_ARGS__); \ + fprintf(stderr, "\n"); \ + \ + if (with_usage) { \ + usage(); \ + } \ + \ + exit(EXIT_FAILURE); \ + } \ + while (0) +``` + +This macro takes a condition (`with_usage`) and, upon failure, prints an error message to standard error, optionally displays usage instructions, and exits the program with a failure status. This convenient encapsulation enhances code readability and maintainability. + +### Usage Function + +Next, the `usage` function is defined: + +```c +static void usage(void) +{ + printf("Usage: decoder \n\n"); + printf("Available encodings:\n"); + ... +} +``` + +This function provides users with information about how to use the decoder program and lists the available character encodings that can be specified as command-line arguments. + +### Main Function Structure + +The `main` function begins with some variable declarations: + +```c +int main(int argc, const char *argv[]) +{ + size_t read_size; + lxb_status_t status; + lxb_codepoint_t cp = 0x0000; + lxb_encoding_decode_t decode; + const lxb_encoding_data_t *encoding; +``` + +**Variable Description:** +- `read_size`: To store the number of bytes read from standard input. +- `status`: To capture the success or failure of encoding operations. +- `cp`: A variable representing the code point being processed. +- `decode`: A structure for managing the decoding state. +- `encoding`: A pointer to the encoding data determined by user input. + +#### Input Validation + +The program first checks for the correct number of command-line arguments: + +```c +if (argc != 2) { + usage(); + exit(EXIT_SUCCESS); +} +``` + +If no encoding is specified, it invokes the `usage` function and exits gracefully. 
+ +#### Encoding Detection + +Next, the program attempts to identify the desired encoding based on the provided name: + +```c +encoding = lxb_encoding_data_by_pre_name((const lxb_char_t *) argv[1], + strlen(argv[1])); +if (encoding == NULL) { + FAILED(true, "Failed to get encoding from name: %s\n\n", argv[1]); +} +``` + +If the specified encoding is not recognized, it triggers the `FAILED` macro, providing feedback to the user. + +#### Decoder Initialization + +The decoder is then initialized: + +```c +status = lxb_encoding_decode_init_single(&decode, encoding); +if (status != LXB_STATUS_OK) { + FAILED(false, "Failed to init decoder"); +} +``` + +This step configures the decoder to use the chosen encoding. If the initialization fails, the program prints an error and exits. + +### Data Reading and Decoding Loop + +The program enters a loop to read from standard input: + +```c +do { + read_size = fread(inbuf, 1, sizeof(inbuf), stdin); + ... + while (data < end) { + cp = encoding->decode_single(&decode, &data, end); + ... + } +} while (loop); +``` + +Within this loop: +- Data is read into a buffer (`inbuf`). +- Each code point is decoded using the `decode_single` method. +- Based on the value of `cp`, different output formats are printed for Unicode and ASCII characters. + +### Output and Continuation + +Finally, the program checks if the decoding process requires continuation, outputting a replacement character where necessary: + +```c +if (cp == LXB_ENCODING_DECODE_CONTINUE) { + printf("\\u%04X", LXB_ENCODING_REPLACEMENT_CODEPOINT); +} +``` + +### Conclusion + +By effectively using the Lexbor library's encoding functionalities, this code provides a flexible and powerful example of how to decode various text encodings from standard input. Developers can adapt this example for their applications, thereby enhancing their ability to handle encoded text data efficiently. 
\ No newline at end of file diff --git a/source/examples/encoding/single/decode/validate.md b/source/examples/encoding/single/decode/validate.md new file mode 100644 index 0000000..76afd1e --- /dev/null +++ b/source/examples/encoding/single/decode/validate.md @@ -0,0 +1,65 @@ +# UTF-8 Decoding and Validation Example + +This article explains an example of decoding and validating a UTF-8 string, using the Lexbor library. The source file for this code example is `lexbor/encoding/single/decode/validate.c`. The primary objective of this code is to demonstrate how to properly decode a UTF-8 encoded string, handle decoding errors, and output both valid code points and error information for invalid byte sequences. + +## Code Breakdown + +The example begins with necessary includes and macro definitions. It imports the required header file for Lexbor encoding and defines a macro `FAILED` that handles error reporting and terminates the program if an error occurs. + +### Setting Up the Main Function + +The `main` function initializes variables needed for decoding. Here, `lxb_status_t status`, `lxb_codepoint_t cp`, and `lxb_encoding_decode_t decode` are declared. Additionally, a pointer to encoding data will be initialized as the UTF-8 encoding. + +```c +lxb_status_t status; +lxb_codepoint_t cp; +lxb_encoding_decode_t decode; +const lxb_encoding_data_t *encoding; +``` + +### Preparing the Data Buffer + +The code prepares a buffer containing the string "Привет,\x80 мир!". The string contains a valid UTF-8 sequence followed by an invalid byte sequence (0x80). The end of the buffer is determined using `strlen` to ensure the decoding process will iterate through the entire string. 
+ +```c +const lxb_char_t *data = (const lxb_char_t *) "Привет,\x80 мир!"; +const lxb_char_t *end = data + strlen((char *) data); +``` + +### Initializing the Decoder + +The encoding is initialized with `lxb_encoding_data(LXB_ENCODING_UTF_8)`, and the decoder is set up using the function `lxb_encoding_decode_init_single`. If initialization fails, the `FAILED` macro reports the error and exits the program. + +```c +encoding = lxb_encoding_data(LXB_ENCODING_UTF_8); +status = lxb_encoding_decode_init_single(&decode, encoding); +if (status != LXB_STATUS_OK) { + FAILED("Failed to init decoder"); +} +``` + +### Decoding Process + +The core loop of the example begins, where the program continuously decodes until the end of the data buffer is reached. Each iteration decodes a single code point from the UTF-8 data. + +```c +while (data < end) { + pos = data; + cp = encoding->decode_single(&decode, &data, end); +} +``` + +If the value returned by `decode_single` does not exceed `LXB_ENCODING_DECODE_MAX_CODEPOINT`, it is a valid code point and is printed together with the UTF-8 bytes it was decoded from. If the returned value exceeds that maximum, the decoder has encountered an invalid byte sequence; the program then prints a message reporting the offending byte along with the replacement code point and continues with the next iteration. + +```c +if (cp > LXB_ENCODING_DECODE_MAX_CODEPOINT) { + printf("Bad byte sequences: 0x%04X; Replaced to: 0x%04X ('%s')\n", + *pos, LXB_ENCODING_REPLACEMENT_CODEPOINT, + LXB_ENCODING_REPLACEMENT_BYTES); + continue; +} +``` + +### Conclusion + +The program concludes by returning a success status if all decoding operations complete without errors. In summary, this code serves as an illustrative example of how to utilize the Lexbor encoding library to decode and validate UTF-8 encoded strings effectively, while properly handling potential errors in byte sequences. By implementing this method, developers can ensure their applications correctly interpret and display UTF-8 content.
\ No newline at end of file diff --git a/source/examples/encoding/single/encode/encode.md b/source/examples/encoding/single/encode/encode.md new file mode 100644 index 0000000..5d8d712 --- /dev/null +++ b/source/examples/encoding/single/encode/encode.md @@ -0,0 +1,36 @@ +# UTF-8 Encoding Example + +This article explains the purpose and functionality of the UTF-8 encoding example provided in the file `lexbor/encoding/single/encode/encode.c`. The code demonstrates how to encode a series of Unicode code points into a UTF-8 byte string using the Lexbor encoding library. + +## Code Overview + +The program begins by including the necessary header file for the Lexbor encoding library. It defines a macro for error handling named `FAILED`, which simplifies printing error messages and terminating the program if initialization or execution fails. + +### Main Function Structure + +The `main` function serves as the entry point of the program. It declares several variables needed for encoding, including a buffer for the output and an encoder instance. The following key steps are involved in the encoding process: + +1. **Buffer Preparation**: + A buffer of 1024 bytes is allocated to hold the UTF-8 encoded string. The variables `data` and `end` are set to track the start and the end of the buffer. + +2. **Unicode Code Points**: + An array of Unicode code points is defined and terminated with a zero. These code points (e.g., Cyrillic characters for "Привет, мир!") are the values that will be encoded. + +3. **Encoding Initialization**: + The function `lxb_encoding_data` retrieves the encoding data for UTF-8, which is passed to `lxb_encoding_encode_init_single` to initialize the encoder. If the initialization fails, the `FAILED` macro is invoked to handle the error. + +4. **Encoding Loop**: + The program enters a loop where each code point is processed for encoding: + - The current position in the buffer (`pos`) is saved. 
+ - The encoder's `encode_single` function is called to perform the encoding. The length of the encoded output is returned. + - If the encoding operation is successful, the resulting UTF-8 bytes are printed alongside their corresponding Unicode code point in hexadecimal format. + +5. **String Termination**: + After processing all code points, the buffer is null-terminated to ensure it is properly formatted as a C string. + +6. **Output Display**: + Finally, the UTF-8 encoded string is printed to the console, demonstrating the successful encoding of the provided Unicode code points. + +## Conclusion + +Upon reaching the end of the program, it exits gracefully, indicating successful execution. This example illustrates how to use the Lexbor encoding library for converting Unicode code points to a UTF-8 encoded string, providing a clear and practical implementation of encoding functionality in C using Lexbor. \ No newline at end of file diff --git a/source/examples/encoding/single/encode/encoder.md b/source/examples/encoding/single/encode/encoder.md new file mode 100644 index 0000000..e43191d --- /dev/null +++ b/source/examples/encoding/single/encode/encoder.md @@ -0,0 +1,70 @@ +# Encoding Input Data Example + +This article explains the purpose and functionality of the `encoder.c` source file located in the `lexbor/encoding/single/encode` directory. The code provides a utility for encoding text input based on a specified character encoding scheme. It reads data from standard input (stdin), decodes any escaped code points in the input, and encodes the results according to the selected encoding. + +## Key Components + +### Header and Macros + +The file begins with some header information including copyright and the author's details. Following this, necessary includes and definitions are placed. The macro `FAILED` is defined to handle error reporting and exit when a critical failure occurs. 
This block of code succinctly prints an error message, displays usage instructions if required, and terminates the program: + +```c +#define FAILED(with_usage, ...) \ + do { \ + fprintf(stderr, __VA_ARGS__); \ + fprintf(stderr, "\n"); \ + if (with_usage) { \ + usage(); \ + } \ + exit(EXIT_FAILURE); \ + } \ + while (0) +``` + +### Usage Function + +The `usage` function outputs the required command-line usage for the program, listing all of the available encodings such as `UTF-8`, `ISO-8859-1`, and `SHIFT-JIS`. This function is invoked if the user does not supply the required arguments. + +```c +static void usage(void) { + printf("Usage: encoder \n\n"); + // List of available encodings... +} +``` + +### Escaped Code Point Conversion + +The function `escaped_to_codepoint` is responsible for converting escaped Unicode sequences to their corresponding code points. The function processes the input data character by character, identifying whether the sequence starts with a backslash, and checking for either hexadecimal (`\x`) or Unicode (`\u`) formats. If an incorrectly formatted escape sequence is detected, an error state is triggered prompting the program to exit: + +```c +static const lxb_char_t *escaped_to_codepoint(const lxb_char_t *data, const lxb_char_t *end, + lxb_codepoint_t *cp, int8_t *state) +{ + // Processing logic... + if (*data != '\\') { + goto failed; // Handle invalid start of escape sequence + } + // More processing... +} +``` + +### Main Functionality + +The `main` function orchestrates the entire encoding process: + +1. **Argument Handling**: It requires one argument indicating the desired encoding. +2. **Encoding Setup**: It retrieves the encoding configuration using the provided argument and initializes the encoder. +3. 
**Input Loop**: The program enters a loop where it reads input data from stdin, processes it into code points, and then encodes these points: + + ```c + while (data < end) { + data = escaped_to_codepoint(data, end, &cp, &state); + // Encoding logic... + } + ``` + +4. **Output Handling**: The encoded output is written to stdout. If the encoding is `UTF-8`, replacement bytes are used as necessary. + +Overall, the program is designed to robustly handle input encoding, managing possible errors during reading and writing, and validating formats. The use of the `lexbor` library enables effective encoding management, providing a variety of supported character encodings. + +In conclusion, the `encoder.c` file serves as a practical example of encoding conversion using a command-line utility, highlighting important coding principles, such as error handling, input/output operations, and state management within the context of encoding mechanisms. \ No newline at end of file diff --git a/source/examples/encoding/single/encode/validate.md b/source/examples/encoding/single/encode/validate.md new file mode 100644 index 0000000..9ebfa1a --- /dev/null +++ b/source/examples/encoding/single/encode/validate.md @@ -0,0 +1,46 @@ +# Encoding Unicode Code Points to UTF-8 Example + +This example demonstrates how to validate and encode Unicode code points into a UTF-8 byte string using the lexbor library. The functionality is encapsulated within a C program located in the `lexbor/encoding/single/encode/validate.c` file. The purpose of this code is to illustrate the encoding of a set of given code points, handling exceptions for those that are invalid by replacing them with a predefined replacement character. + +## Overview of the Code + +The code begins by including the necessary header files from the lexbor library, specifically targeting encoding functionality. 
It subsequently defines a macro for error handling, which outputs an error message to `stderr` and exits the program with a failure status. + +### Variable Declarations + +The `main` function sets up various variables needed for the encoding process: + +- `len`: This variable stores the number of bytes produced by each encoding call. +- `status`: Utilized for capturing the status of encoding operations. +- `encode`: An instance of `lxb_encoding_encode_t`, used to manage encoding context. +- `encoding`: A pointer to the appropriate encoding data. +- `pos`: A pointer that tracks the current position in the output buffer. + +### Buffer Preparation + +A buffer (`buffer`) of 1024 `lxb_char_t` elements is defined to hold the resulting UTF-8 byte string. Pointers are initialized to manage the writing process into this buffer safely. + +### Unicode Code Points + +An array of Unicode code points is declared, which includes several valid code points and one intentionally invalid code point (`0x110000`). This illustrates how the code handles bad input during encoding. + +### Encoding Initialization + +The code retrieves the UTF-8 encoding data using `lxb_encoding_data(LXB_ENCODING_UTF_8)` and initializes the encoding context with `lxb_encoding_encode_init_single(&encode, encoding)`. If this initialization fails, an error message is reported, and the program exits. + +### Encoding Loop + +The core functionality is encapsulated in a loop that processes each code point from the `cps` array: + +1. **Position Tracking**: The position pointer `pos` is reset to the current data pointer at the start of the loop iteration. +2. **Encoding**: Each code point is encoded using the `encode_single` method. The returned `len` represents the number of bytes written to the buffer. +3. **Error Handling**: If `len` indicates a problem (less than `LXB_ENCODING_ENCODE_OK`), the code checks for buffer size issues (though this example does not expect to encounter this).
If the code point is invalid, it prints an error message along with a replacement character output, handling the invalid code point scenario gracefully. +4. **Output**: For valid code points, the program prints the code point and its corresponding UTF-8 representation. + +### Finalization + +After processing all code points, the program terminates the string by setting the last byte of the buffer to `0x00`. It then prints the final UTF-8 result. + +## Conclusion + +The program effectively showcases how to handle Unicode encoding with proper error management for invalid inputs. This example is particularly useful for developers using the lexbor library to manage character encodings, providing insight on validating and encoding procedures in C. \ No newline at end of file diff --git a/source/examples/encoding/single/from_to.md b/source/examples/encoding/single/from_to.md new file mode 100644 index 0000000..5e120af --- /dev/null +++ b/source/examples/encoding/single/from_to.md @@ -0,0 +1,99 @@ +# Encoding Conversion Example + +This article explains the encoding conversion functionality provided in the source file `lexbor/encoding/single/from_to.c`. The code allows users to convert text from one character encoding to another via command-line input. It demonstrates how to utilize the Lexbor encoding library for encoding and decoding different formats of character sets. + +## Overview + +The main function in this code receives two command-line arguments representing the source (`from`) and target (`to`) encodings. It reads input data from standard input, decodes it from the specified `from` encoding to Unicode code points, and then encodes those code points into the specified `to` encoding before writing the output to standard output. 
+ +## Code Breakdown + +### Definitions and Includes + +At the beginning of the file, we include the necessary header for the Lexbor encoding module: + +```c +#include <lexbor/encoding/encoding.h> +``` + +This gives us access to various functions and types defined in the library, which facilitate character encoding tasks. + +### Failure Handling Macro + +The `FAILED` macro is defined for error handling throughout the code: + +```c +#define FAILED(with_usage, ...) ... +``` + +This macro simplifies error reporting by printing error messages to standard error and conditionally calling the `usage` function to display usage instructions before terminating the program. Adopting this macro ensures a consistent approach to error handling across the code. + +### Usage Function + +The `usage` function provides instructions on how to use the encoding conversion tool: + +```c +static void usage(void) { ... } +``` + +It lists the accepted input encodings that users can specify when executing the program. This function is crucial for user guidance, ensuring that they know the correct format for command inputs. + +### Main Function + +The `main` function orchestrates the overall process: + +```c +int main(int argc, const char *argv[]) { ... } +``` + +1. **Argument Count Check**: The function starts by checking if the user provided exactly two arguments (the source and target encodings). If not, the `usage` function is called, and the program exits. + +2. **Encoding Data Retrieval**: The code fetches the encoding information for both the source and target encodings using the `lxb_encoding_data_by_pre_name` function: + + ```c + from = lxb_encoding_data_by_pre_name(...); + to = lxb_encoding_data_by_pre_name(...); + ``` + + If either retrieval fails, the `FAILED` macro is triggered, stopping execution. + +3. 
**Initialization of Encoder and Decoder**: The encoder and decoder are initialized with the retrieved encoding data: + + ```c + status = lxb_encoding_encode_init_single(&encode, to); + status = lxb_encoding_decode_init_single(&decode, from); + ``` + + These initializations set up the necessary state for encoding and decoding operations. + +### Input Reading and Processing Loop + +The program enters a loop where it continuously reads from standard input until EOF (End Of File) is reached: + +```c +do { + read_size = fread(inbuf, 1, sizeof(inbuf), stdin); + ... +} while (loop); +``` + +Within the loop: + +- The fetched data is decoded using the decoder initialized with the `from` encoding to obtain Unicode code points. + +- Each decoded code point is then encoded using the encoder initialized with the `to` encoding and written to standard output. + +### Finalization + +After processing all input data, the code finalizes the decoder and encoder: + +```c +status = lxb_encoding_decode_finish_single(&decode); +len = lxb_encoding_encode_finish_single(&encode, &out, out_end); +``` + +These finalization steps ensure that any remaining data is processed and that resources are cleaned up properly before the program exits. + +## Conclusion + +The `from_to.c` example illustrates a practical approach to character encoding conversion using the Lexbor encoding library. It showcases error handling, user guidance, and processing loops, making it a valuable reference for developers needing to handle various text encodings in their applications. This example emphasizes the importance of robust input handling and clean output generation within character encoding operations. \ No newline at end of file diff --git a/source/examples/html/document_parse.md b/source/examples/html/document_parse.md new file mode 100644 index 0000000..6388648 --- /dev/null +++ b/source/examples/html/document_parse.md @@ -0,0 +1,90 @@ +# HTML Document Parsing Example + +This article explains an example of parsing an HTML document using the Lexbor library. 
The purpose of this example, located in the source file `lexbor/html/document_parse.c`, is to illustrate the steps necessary to create an HTML document, parse a string of HTML, and serialize the resulting DOM tree. + +## Example Overview + +The example demonstrates the following key steps: + +1. **Creating the HTML Document**: Initializing a new HTML document. +2. **Parsing the HTML**: Taking an HTML string and processing it to generate a DOM tree. +3. **Outputting the Results**: Printing the original HTML and the resulting DOM structure. +4. **Cleaning Up**: Destroying the document to free allocated resources. + +## Code Explanation + +### Main Function + +The program starts in the `main` function, where it declares a variable for the document status and a pointer to the HTML document. + +```c +lxb_status_t status; +lxb_html_document_t *document; +``` + +### Defining HTML Content + +A static character array contains the HTML to be parsed. The length of this HTML string is also calculated. + +```c +static const lxb_char_t html[] = "

blah-blah-blah

"; +size_t html_len = sizeof(html) - 1; +``` + +### Document Initialization + +The next segment involves initializing a new HTML document using the `lxb_html_document_create` function. This function allocates necessary memory and sets up internal structures to hold the document data. + +```c +document = lxb_html_document_create(); +if (document == NULL) { + FAILED("Failed to create HTML Document"); +} +``` + +If the document creation fails, an error message is printed, allowing for debugging. + +### HTML Parsing + +Once the document is created, the program parses the HTML content. The `lxb_html_document_parse` function is responsible for parsing the input HTML string. + +```c +status = lxb_html_document_parse(document, html, html_len); +if (status != LXB_STATUS_OK) { + FAILED("Failed to parse HTML"); +} +``` + +If the status indicates a failure, an appropriate message is shown. This rigorous checking ensures that errors during parsing do not go unnoticed. + +### Output the Results + +After successfully parsing the HTML, the program prints the original HTML string and serializes the resulting DOM tree. The `PRINT` macro is used for outputting the HTML content. + +```c +PRINT("HTML:"); +PRINT("%s", (const char *) html); +``` + +It then calls a serialization function to visualize the structure of the parsed HTML document: + +```c +PRINT("\nHTML Tree:"); +serialize(lxb_dom_interface_node(document)); +``` + +This step helps developers understand how the HTML input is translated into a DOM tree structure, which is crucial for many web development tasks. + +### Document Cleanup + +Finally, the program cleans up by destroying the HTML document to avoid memory leaks. This is done using the `lxb_html_document_destroy` function: + +```c +lxb_html_document_destroy(document); +``` + +Ensuring proper resource management is important in C programming, as it helps maintain system performance and stability. 
+ +## Conclusion + +The example provided in `lexbor/html/document_parse.c` serves as a clear demonstration of how to create, parse, and handle an HTML document using Lexbor. Through careful initialization, parsing, result outputting, and cleanup, this code illustrates best practices for managing HTML documents in a C environment. \ No newline at end of file diff --git a/source/examples/html/document_parse_chunk.md b/source/examples/html/document_parse_chunk.md new file mode 100644 index 0000000..7298f09 --- /dev/null +++ b/source/examples/html/document_parse_chunk.md @@ -0,0 +1,92 @@ +# HTML Document Parsing Example + +This article provides an overview of an example implementation of HTML document parsing using the Lexbor library. The example is located in the source file `lexbor/html/document_parse_chunk.c`. This example demonstrates how to create an HTML document, parse it in chunks, and handle the cleaning up of allocated resources. + +## Code Overview + +The primary function of the code is to illustrate how to process HTML content in segments, allowing for a more flexible parsing technique suitable for scenarios where full documents may not be available in one piece. This chunk-based parsing can be particularly useful for streaming applications or when handling very large HTML documents. + +### Initialization + +At the beginning of the `main` function, several essential variables are declared, including a status variable of type `lxb_status_t` and a pointer to a `lxb_html_document_t`, which will represent our HTML document. + +```c +lxb_html_document_t *document; +``` + +The `lxb_html_document_create()` function is called to create an instance of the HTML document. It is essential to check whether the document was created successfully. + +```c +document = lxb_html_document_create(); +if (document == NULL) { + FAILED("Failed to create HTML Document"); +} +``` + +If the document creation fails, the program will exit, indicating an error. 
+ +### Parsing HTML Chunks + +The HTML content is stored in a two-dimensional array of characters. Each string represents a fragment of the HTML document. The fragments are designed to be combined later to form a complete HTML structure. + +```c +static const lxb_char_t html[][64] = { + "", "", "HTML chun", + "ks parsing", "
", "good for me", "
", "\0" +}; +``` + +After setting up the document, the code initiates the parsing process by calling `lxb_html_document_parse_chunk_begin()`, which prepares the document to accept incoming chunks of HTML. + +```c +status = lxb_html_document_parse_chunk_begin(document); +``` + +The program then enters a loop that iterates over each HTML chunk until it reaches a null-terminating character. For each chunk, it prints the chunk content and attempts to parse it using `lxb_html_document_parse_chunk()`. This function takes the current HTML chunk and its length as input, returning a status that indicates success or failure. + +```c +for (size_t i = 0; html[i][0] != '\0'; i++) { + PRINT("%s", (const char *) html[i]); + + status = lxb_html_document_parse_chunk(document, html[i], + strlen((const char *) html[i])); + if (status != LXB_STATUS_OK) { + FAILED("Failed to parse HTML chunk"); + } +} +``` + +If any chunk fails to parse correctly, the program will exit with an error message. + +### Finalization + +After processing all HTML chunks, the end of the parsing process is signaled with the call to `lxb_html_document_parse_chunk_end()`. This function finalizes the parsing operation and validates the final structure of the document. + +```c +status = lxb_html_document_parse_chunk_end(document); +if (status != LXB_STATUS_OK) { + FAILED("Failed to parse HTML"); +} +``` + +### Printing Results + +Once parsing is complete, the example demonstrates how to serialize the resulting HTML DOM tree using the `serialize()` function, allowing the user to see the structured representation of the parsed HTML content. + +```c +PRINT("\nHTML Tree:"); +serialize(lxb_dom_interface_node(document)); +``` + +### Cleanup + +Finally, the document is destroyed using `lxb_html_document_destroy()`, which frees the allocated memory associated with the HTML document instance. This resource management step is crucial in avoiding memory leaks. 
+ +```c +lxb_html_document_destroy(document); +``` + +## Conclusion + +This example effectively illustrates how to use Lexbor for HTML document parsing in a chunked manner. The structure and logic of the code provide a solid foundation for more advanced HTML processing applications. It encapsulates essential operations such as initialization, incremental parsing, result extraction, and cleanup in a clear and easy-to-follow manner. \ No newline at end of file diff --git a/source/examples/html/document_title.md b/source/examples/html/document_title.md new file mode 100644 index 0000000..5952f52 --- /dev/null +++ b/source/examples/html/document_title.md @@ -0,0 +1,89 @@ +# HTML Document Title Example + +This article will explain the functionality of the HTML document title example implemented in the source code found in `lexbor/html/document_title.c`. The purpose of this code is to demonstrate how to parse an HTML string, retrieve its title, modify the title, and then display the resulting HTML document structure using the Lexbor library. + +## Code Breakdown + +### Initialization + +The code begins with the inclusion of the required headers and the setup of the `main` function, which is the entry point of the program. Here, the main task involves creating an HTML document instance and specifying the necessary variables. + +```c +lxb_html_document_t *document; +``` +This line declares a pointer to an `lxb_html_document_t` structure which represents the HTML document being created. The succeeding lines define variables for storing the title and its length. + +### Creating the Document + +The next significant step is the initialization of the HTML document: + +```c +document = lxb_html_document_create(); +if (document == NULL) { + FAILED("Failed to create HTML Document"); +} +``` +In this snippet, the `lxb_html_document_create` function is called to allocate memory for a new HTML document. 
If the document fails to create, the program invokes the `FAILED` macro to signal an error. + +### Parsing HTML + +After successfully creating the document, the code proceeds to parse the HTML string: + +```c +status = lxb_html_document_parse(document, html, html_len); +if (status != LXB_STATUS_OK) { + FAILED("Failed to parse HTML"); +} +``` +Here, the HTML content defined in the `html` array—specifically the title tag which contains extra spaces—is parsed. The variable `status` checks if the operation was successful. If not, the program exits with an error message. + +### Retrieving the Title + +Once the document is parsed, the code retrieves the title of the document: + +```c +title = lxb_html_document_title(document, &title_len); +``` +This function call extracts the title text from the document, storing it into the `title` variable. The length of the title is also provided through the `title_len` reference. The subsequent `if` statement checks whether the title exists, printing the title or an empty message accordingly. + +### Obtaining the Raw Title + +The following code retrieves the raw title, which includes the original formatting (e.g., extra spaces): + +```c +title = lxb_html_document_title_raw(document, &title_len); +``` +Much like the previous title retrieval, this extracts the unformatted title, allowing a comparison between the cleaned and raw titles. + +### Modifying the Title + +The code then demonstrates how to change the document's title: + +```c +status = lxb_html_document_title_set(document, new_title, new_title_len); +if (status != LXB_STATUS_OK) { + FAILED("Failed to change HTML title"); +} +``` +By invoking `lxb_html_document_title_set`, the title is altered to a new value defined by the `new_title` variable. An error check follows to ensure the title change was successful. 
+ +### Displaying the New Title and HTML Structure + +The final steps involve displaying the updated title and the entire HTML document structure after modification: + +```c +title = lxb_html_document_title(document, &title_len); +``` +This repeats the earlier title retrieval process to print the new title. Finally, the code prints the altered HTML structure to show the impact of the title change. + +### Cleanup + +Lastly, the document is destroyed to free the allocated memory, which is crucial for preventing memory leaks: + +```c +lxb_html_document_destroy(document); +``` + +## Conclusion + +This example illustrates the basic operations for handling HTML document titles using the Lexbor library, including parsing content, accessing and modifying the title, and ensuring proper resource management. The structure of the code is straightforward, aiming to provide a clear understanding of each step involved in managing an HTML document's title. As developers familiarize themselves with the functionalities offered by Lexbor, they will be better equipped to manipulate HTML content programmatically. \ No newline at end of file diff --git a/source/examples/html/element_attributes.md b/source/examples/html/element_attributes.md new file mode 100644 index 0000000..0ef536a --- /dev/null +++ b/source/examples/html/element_attributes.md @@ -0,0 +1,120 @@ +# Element Attributes Example + +This article explains the implementation found in `lexbor/html/element_attributes.c`, which demonstrates how to manipulate HTML element attributes using the Lexbor library. The example outlines parsing an HTML snippet, finding an element, and performing various operations involving element attributes, such as adding, checking existence, retrieving, modifying, and removing attributes from an element. + +## Code Overview + +The code begins by including necessary headers and defining the main function, which initializes variables for handling the document and its components. 
The use of `lxb_status_t` for tracking the status of operations is essential throughout the code. + +### HTML Parsing + +The code defines a static HTML string: + +```c +static const lxb_char_t html[] = "
"; +``` + +A document is parsed from this HTML string with: + +```c +document = parse(html, html_len); +``` + +After parsing, the code outputs the structure of the DOM tree to the console using a `serialize` function, allowing developers to visualize the parsed HTML elements. + +### Collection Creation + +Next, a DOM collection is created to hold references to found elements: + +```c +collection = lxb_dom_collection_make(&document->dom_document, 16); +``` + +If the collection creation fails, an error message is printed, and the program exits. + +### Searching for Elements + +To find the `
` element in the DOM, the code first obtains the body element and then calls: + +```c +status = lxb_dom_elements_by_tag_name(element, collection, (const lxb_char_t *) "div", 3); +``` + +This line searches for all `
` elements under the specified parent element. A check for successful status and the collection's length follows, ensuring that at least one `
` is found. + +### Adding an Attribute + +Once the element is identified, a new attribute is added using: + +```c +attr = lxb_dom_element_set_attribute(element, name, name_size, (const lxb_char_t *) "oh God", 6); +``` + +In this case, the attribute named "my-name" is appended with a value of "oh God." If the attribute creation fails, an error message is displayed. + +### Checking Attribute Existence + +The program checks if the newly added attribute exists: + +```c +is_exist = lxb_dom_element_has_attribute(element, name, name_size); +``` + +A printed message confirms whether the attribute is present or not based on the check. + +### Retrieving Attribute Values + +The next operation retrieves the value of the specified attribute: + +```c +value = lxb_dom_element_get_attribute(element, name, name_size, &value_len); +``` + +If successful, it prints the value associated with the "my-name" attribute. + +### Iterating Through Attributes + +The code then demonstrates how to iterate through all attributes of the element: + +```c +attr = lxb_dom_element_first_attribute(element); +``` + +This iterates through attributes using a `while` loop, printing each attribute's name and value until there are no more attributes in the collection. + +### Modifying an Attribute Value + +To change the value of an existing attribute, the code retrieves the attribute by name: + +```c +attr = lxb_dom_element_attr_by_name(element, name, name_size); +``` + +Then, it updates the value to "new value" using: + +```c +status = lxb_dom_attr_set_value(attr, (const lxb_char_t *) "new value", 9); +``` + +### Removing an Attribute + +Finally, the example concludes with the removal of the newly added attribute: + +```c +lxb_dom_element_remove_attribute(element, name, name_size); +``` + +This operation is followed by a serialized output of the DOM tree again, allowing the developer to observe changes. 
+ +### Cleanup + +The code ensures proper resource management by destroying the collection and the document at the end of the main function to prevent memory leaks: + +```c +lxb_dom_collection_destroy(collection, true); +lxb_html_document_destroy(document); +``` + +## Conclusion + +The `element_attributes.c` example illustrates fundamental operations in DOM manipulation provided by the Lexbor library. The code efficiently demonstrates how to parse HTML, locate and manipulate elements, manage attributes, and ensure appropriate cleanup of resources, making it a valuable reference for web developers working with the Lexbor framework. \ No newline at end of file diff --git a/source/examples/html/element_create.md b/source/examples/html/element_create.md new file mode 100644 index 0000000..dd827ff --- /dev/null +++ b/source/examples/html/element_create.md @@ -0,0 +1,37 @@ +# HTML Element Creation Example + +This article explains the implementation of creating and appending HTML elements in a document using the Lexbor library. The example provided is from the source file `lexbor/html/element_create.c`. + +## Introduction + +The code demonstrates how to initialize an HTML document, create various HTML elements using their tag IDs, and manage them within a document structure. The main functionalities utilized include parsing an empty HTML document, creating elements, and displaying the resulting tree structure through serialization. + +## Code Overview + +1. **Initialization**: + The code begins with the necessary includes and the definition of the `main` function. It declares necessary pointers to hold the document, body element, and tags. + +2. **Parse Document**: + The function `parse` is called with an empty string, initializing an HTML document. This is essential for setting up a base where elements can be created and manipulated. + +3. 
**Accessing the Body Element**: + The body of the document is obtained using `lxb_html_document_body_element(document)`, allowing further manipulations to be performed on this node. + +4. **Creating Elements**: + A loop iterates over all tag IDs defined by the Lexbor library, from `LXB_TAG_A` to `LXB_TAG__LAST_ENTRY`. For each tag: + - The tag name is retrieved using `lxb_tag_name_by_id`. + - An element is created with `lxb_dom_document_create_element`. This function constructs the DOM element based on the tag name. + - If the tag is identified as void (such as `<br>` or `<img>`), it is created without a text node. Conversely, non-void tags generate text nodes through `lxb_dom_document_create_text_node`, allowing text content to be associated with those elements. + +5. **Inserting Elements into the Tree**: + Each created element is serialized for output and then inserted into the body of the document using `lxb_dom_node_insert_child`. + +6. **Final Output**: + After all elements are created and appended, the updated document tree is printed to show the result of the insertions. + +7. **Cleanup**: + Finally, the allocated document is destroyed using `lxb_html_document_destroy` to prevent memory leaks. + +## Conclusion + +This program effectively showcases the process of dynamically creating HTML elements using the Lexbor library. It covers the aspects of parsing, element creation, manipulation, and serialization, providing an essential toolkit for developers looking to work with HTML structures programmatically. The inclusion of error handling ensures reliability, allowing developers to catch and address potential issues during element creation. \ No newline at end of file diff --git a/source/examples/html/element_innerHTML.md b/source/examples/html/element_innerHTML.md new file mode 100644 index 0000000..02740ac --- /dev/null +++ b/source/examples/html/element_innerHTML.md @@ -0,0 +1,59 @@ +# Setting innerHTML Example + +This article will explain the `innerHTML` manipulation in the context of the Lexbor HTML parser, as illustrated in the source file `lexbor/html/element_innerHTML.c`. This example demonstrates how to parse HTML content, modify an element's inner HTML, and serialize the result. + +## Code Overview + +The code starts with the inclusion of the necessary header file, `base.h`, which likely contains the essential definitions and functions for the Lexbor library. The `main` function serves as the entry point for the execution of this program. 
+ +### HTML Parsing + +The program begins by defining a simple HTML string containing a `<div>` with a nested `<span>` element. The length of this string is calculated using `sizeof(html) - 1` to exclude the null terminator from the count. The predefined HTML string is as follows: + +```c +static const lxb_char_t html[] = "
blah-blah-blah
"; +``` + +Next, the `parse` function is called with the HTML string and its length. This function processes the HTML and generates a document object model (DOM), representing the structure of the HTML document in memory. + +### Printing the Parsed HTML + +The program checks the output of the `parse` function and prints the original HTML and the resulting DOM tree. This is accomplished with the `PRINT` macro, which appears to be a utility for outputting messages. The serialized DOM is obtained using the `serialize` function on the document's root node: + +```c +PRINT("HTML:"); +PRINT("%s", (const char *) html); +PRINT("\nTree after parse:"); +serialize(lxb_dom_interface_node(document)); +``` + +### Inner HTML Modification + +Subsequently, a second HTML string is defined, which will be set as the inner HTML of the body element. This inner HTML is specified as follows: + +```c +static const lxb_char_t inner[] = "
  • 1
  • 2
  • 3
"; +``` + +The program retrieves the body element of the document using `lxb_html_document_body_element(document)`. The inner HTML of the body is then set using the `lxb_html_element_inner_html_set` function, which takes the body element and the inner HTML string along with its length as arguments: + +```c +element = lxb_html_element_inner_html_set(lxb_html_interface_element(body), + inner, inner_len); +``` + +If the `element` is `NULL`, indicating a failure in setting the inner HTML, a failure message is printed through the `FAILED` macro. + +### Final Output + +After setting the inner HTML, the program serializes the modified DOM tree and prints the result. This demonstrates the changes made by the inner HTML operation. Finally, the code cleans up by destroying the document to release resources allocated for the DOM. + +```c +PRINT("\nTree after innerHTML set:"); +serialize(lxb_dom_interface_node(document)); +lxb_html_document_destroy(document); +``` + +## Conclusion + +The example provided illustrates how to parse an HTML string, modify an element's inner HTML content, and serialize the resulting DOM structure using Lexbor's capabilities. This demonstrates an essential functionality often used in web development for DOM manipulation, showcasing the ease of use of the Lexbor library for such tasks. \ No newline at end of file diff --git a/source/examples/html/elements_by_attr.md b/source/examples/html/elements_by_attr.md new file mode 100644 index 0000000..0a8aeab --- /dev/null +++ b/source/examples/html/elements_by_attr.md @@ -0,0 +1,131 @@ +# Retrieving Elements by Attribute Example + +This article will explain the functionality and implementation of the code found in **lexbor/html/elements_by_attr.c**, which demonstrates how to retrieve DOM elements based on specific attributes using the lexbor library. + +## Overview + +The provided code showcases how to extract elements from an HTML document based on their attributes. 
It specifically focuses on obtaining elements by 'class' and 'href' attributes, employing methods that match, search from the beginning, and search from the end of the attribute values. + +## Code Breakdown + +### Including Necessary Headers + +The code starts with including essential headers: + +```c +#include "base.h" +#include <lexbor/dom/dom.h> +``` + +The `base.h` header seems to contain definitions and functions crucial for this example, while `lexbor/dom/dom.h` provides the necessary DOM manipulations for lexbor. + +### Print Collection Function + +The function `print_collection_elements` is defined to handle the output of the retrieved elements: + +```c +static void print_collection_elements(lxb_dom_collection_t *collection) +``` + +This function loops through the elements within the provided collection using its length and utilizes the `serialize_node` function to print each element. After processing, it ensures to clean up the collection to prevent memory leaks. + +### Main Function Execution + +The `main` function is where the key processes occur: + +```c +int main(int argc, const char *argv[]) +``` + +#### Parsing HTML + +The HTML content is defined statically: + +```c +const lxb_char_t html[] = "
" +"
" +"
" +"ref" +"
"; +``` + +This string contains several `
` and `` tags with diverse class attributes and an `href`. The length of this HTML string is then calculated. + +#### Creating Document and Collection + +Following that, the HTML is parsed, creating a document object: + +```c +document = parse(html, html_szie); +``` + +Next, a collection object is created that will hold the elements found based on the attribute queries: + +```c +collection = lxb_dom_collection_make(&document->dom_document, 128); +``` + +A check is performed to ensure that the collection was created successfully. + +#### Searching Elements by Attributes + +The program performs several searches: + +1. **Full Match:** + Using `lxb_dom_elements_by_attr`, it searches for elements with the exact class `red c++ best`: + + ```c + status = lxb_dom_elements_by_attr(body, collection, + (const lxb_char_t *) "class", 5, + (const lxb_char_t *) "red c++ best", 12, + true); + ``` + + If the search is successful, the found elements are printed. + +2. **From Beginning:** + The code retrieves elements with an `href` that starts with `http`: + + ```c + status = lxb_dom_elements_by_attr_begin(body, collection, + (const lxb_char_t *) "href", 4, + (const lxb_char_t *) "http", 4, + true); + ``` + +3. **From End:** + This search targets elements with classes ending in `grep`: + + ```c + status = lxb_dom_elements_by_attr_end(body, collection, + (const lxb_char_t *) "class", 5, + (const lxb_char_t *) "grep", 4, + true); + ``` + +4. **Contain:** + Finally, it looks for elements where the class contains the substring `c++ b`: + + ```c + status = lxb_dom_elements_by_attr_contain(body, collection, + (const lxb_char_t *) "class", 5, + (const lxb_char_t *) "c++ b", 5, + true); + ``` + +Each of these searches utilizes the collection to retrieve relevant elements, printing them as they are found. 
+ +#### Cleanup + +After the searches, cleanup processes are executed to free the allocated resources: + +```c +lxb_dom_collection_destroy(collection, true); +lxb_html_document_destroy(document); +``` + +This is critical for maintaining memory hygiene in C programs. + +## Conclusion + +This code snippet demonstrates how to efficiently query and manipulate DOM elements in an HTML document using the lexbor library. By utilizing various search strategies based on attributes, developers can effectively streamline their DOM interactions, showcasing the flexibility and power of the lexbor library for handling HTML content. \ No newline at end of file diff --git a/source/examples/html/elements_by_class_name.md b/source/examples/html/elements_by_class_name.md new file mode 100644 index 0000000..57abd22 --- /dev/null +++ b/source/examples/html/elements_by_class_name.md @@ -0,0 +1,80 @@ +# Getting Elements by Class Name Example + +In this article, we will explore the implementation details and functionality of the `elements_by_class_name` example, found in the `lexbor/html/elements_by_class_name.c` source file. The code demonstrates how to parse an HTML string and retrieve elements with a specific class name using the lexbor library. This example is essential for developers seeking to manipulate and query DOM elements in a structured manner. + +## Overview + +The `main` function begins by initializing variables, including `status`, `element`, `document`, and `collection`. It then assigns an HTML string to the `html` variable, which contains multiple `
` elements with various class names. The length of the HTML string is calculated and stored in `html_size`. + +```c +const lxb_char_t html[] = "
" + "
" + "
" + "
"; + +size_t html_size = sizeof(html) - 1; +``` + +## Parsing the HTML Document + +Next, the code invokes the `parse` function to parse the HTML string and create a DOM document. This document serves as the basis for subsequent operations on the DOM elements contained within the HTML. + +```c +document = parse(html, html_size); +``` + +## Creating a Collection for DOM Elements + +Once the document is obtained, the next step is to create a collection to hold the elements retrieved by class name. The `lxb_dom_collection_make` function is called with the document's DOM and an initial capacity of 128. If the collection cannot be created, an error message is triggered. + +```c +collection = lxb_dom_collection_make(&document->dom_document, 128); +if (collection == NULL) { + FAILED("Failed to create Collection object"); +} +``` + +## Retrieving Elements by Class Name + +The `lxb_dom_elements_by_class_name` function enables the search for elements with a specified class name. In this instance, it looks for elements with the class name "best". The function leverages the interface of the document's body to initiate the retrieval process and populate the `collection`. + +```c +status = lxb_dom_elements_by_class_name(lxb_dom_interface_element(document->body), + collection, (const lxb_char_t *) "best", 4); +if (status != LXB_STATUS_OK) { + FAILED("Failed to get elements by name"); +} +``` + +After ensuring the retrieval is successful, the code proceeds to print the original HTML and details about the found elements. + +```c +PRINT("HTML:"); +PRINT("%s", (const char *) html); +PRINT("\nFind all 'div' elements by class name 'best'."); +PRINT("Elements found:"); +``` + +## Serializing and Printing Found Elements + +A loop iterates through the collection of found elements, invoking the `serialize_node` function to output each element's details. This demonstrates how easy it is to interact with the elements returned by the class name query. 
+ +```c +for (size_t i = 0; i < lxb_dom_collection_length(collection); i++) { + element = lxb_dom_collection_element(collection, i); + serialize_node(lxb_dom_interface_node(element)); +} +``` + +## Cleanup + +Finally, the `collection` and `document` are cleaned up to free allocated resources. This step is crucial for managing memory within the application, especially when dealing with large or complex documents. + +```c +lxb_dom_collection_destroy(collection, true); +lxb_html_document_destroy(document); +``` + +## Conclusion + +The `elements_by_class_name` example illustrates how to use the lexbor library to parse HTML content, search for elements by class name, and efficiently manage those elements. The critical sections of the code demonstrate proper document handling, error management, and systematic cleanup, providing a solid foundation for developers exploring DOM manipulation within C. \ No newline at end of file diff --git a/source/examples/html/elements_by_tag_name.md b/source/examples/html/elements_by_tag_name.md new file mode 100644 index 0000000..ff18ade --- /dev/null +++ b/source/examples/html/elements_by_tag_name.md @@ -0,0 +1,115 @@ +# HTML Elements by Tag Name Example + +This article will explain the code found in the source file `lexbor/html/elements_by_tag_name.c`, which demonstrates how to find and print HTML elements by their tag names using the Lexbor DOM library. + +## Code Overview + +The purpose of this example is to parse a simple HTML string and retrieve all `
` elements from the parsed document. It achieves this by leveraging the Lexbor library's DOM capabilities to manage and manipulate the HTML document structure. + +## Main Function + +The entry point of the program is the `main` function, which begins by declaring several variables essential for the parsing process: + +- `status` stores the success or failure status of various operations. +- `element` will point to the current HTML element being processed. +- `document` links to the HTML document that will be created from the parsed input. +- `collection` is intended to hold the collection of elements found in the document. + +### Parsing HTML + +The HTML string defined as: + +```c +const lxb_char_t html[] = "
"; +``` + +represents a simple HTML fragment which contains two `
` elements and a `` element. The size of the HTML string is determined next: + +```c +size_t html_size = sizeof(html) - 1; +``` + +This allows the program to recognize the length of the string without including the null terminator. + +The `parse` function is then called to create a `document` from the HTML string: + +```c +document = parse(html, html_size); +``` + +This function interprets the HTML and constructs a corresponding DOM structure. The parsing outcome is crucial; it will dictate the next steps in the program. + +### Creating a DOM Collection + +A collection is created to hold the resulting nodes: + +```c +collection = lxb_dom_collection_make(&document->dom_document, 128); +``` + +This function attempts to allocate memory for a collection that can store up to 128 DOM elements. If memory allocation fails, the program exits with an error message: + +```c +if (collection == NULL) { + FAILED("Failed to create Collection object"); +} +``` + +### Retrieving Elements by Tag Name + +The critical operation of this example is retrieving `
` elements from the document: + +```c +status = lxb_dom_elements_by_tag_name(lxb_dom_interface_element(document->body), + collection, (const lxb_char_t *) "div", 3); +``` + +Here, `lxb_dom_elements_by_tag_name` takes four arguments: +1. The reference to the body of the document. +2. The collection object to store the found elements. +3. The string `"div"`, specifying which tags to search for. +4. The length of that string (`3`). + +If the call is unsuccessful, it again exits with an error message: + +```c +if (status != LXB_STATUS_OK) { + FAILED("Failed to get elements by name"); +} +``` + +### Output the Found Elements + +The program then prints the initial HTML string and displays a message indicating that it is about to list the found `
` elements: + +```c +PRINT("HTML:"); +PRINT("%s", (const char *) html); +PRINT("\nFind all 'div' elements by tag name 'div'."); +PRINT("Elements found:"); +``` + +The elements collected are iterated over and serialized for display: + +```c +for (size_t i = 0; i < lxb_dom_collection_length(collection); i++) { + element = lxb_dom_collection_element(collection, i); + serialize_node(lxb_dom_interface_node(element)); +} +``` + +This loop retrieves each element from the collection by index and uses the `serialize_node` function to output its representation. + +### Cleanup + +Finally, memory allocated for the collection and the document is released: + +```c +lxb_dom_collection_destroy(collection, true); +lxb_html_document_destroy(document); +``` + +This ensures that there are no memory leaks after the program's execution is complete. + +## Conclusion + +This example serves as a practical demonstration of how to use the Lexbor library to parse HTML and find elements by tag name. By using functions from the library's API, the code effectively processes a document and manages collections of elements, showcasing the utility of the Lexbor framework in web development tasks. \ No newline at end of file diff --git a/source/examples/html/encoding.md b/source/examples/html/encoding.md new file mode 100644 index 0000000..57f6e82 --- /dev/null +++ b/source/examples/html/encoding.md @@ -0,0 +1,41 @@ +# HTML Encoding Example + +This article provides an explanation for the HTML Encoding example found in the file `lexbor/html/encoding.c`. This program is designed to read an HTML file, determine its character encoding, and print it out. The implementation utilizes the Lexbor library, which offers various functions to handle encoding. + +## Overview + +The main function of the example handles command-line input, reads an HTML file, and determines its encoding using the Lexbor library. 
The code includes a failure handling mechanism and a usage function to guide users on how to execute the program properly. + +## Key Code Sections + +### Error Handling Macro + +The `FAILED` macro is a pivotal part of this code, providing a consistent way to handle errors throughout the program. It takes two parameters: a boolean flag `with_usage` and a variable number of arguments. If an error occurs, it prints the provided error message to the standard error stream and, if requested, displays the usage information before quitting the program. This helps keep the code clean while managing multiple error points effectively. + +### Command-Line Arguments + +In the `main` function, the program checks the number of command-line arguments passed to it. If the argument count does not equal 2, the program calls the `usage` function to provide instructions on how to execute the program correctly and then exits. This ensures that users understand how to use the program before any further processing occurs. + +### Reading the HTML File + +The program reads the HTML file specified in the command-line argument using the `lexbor_fs_file_easy_read` function. It stores the content in a dynamic array and checks for successful reading. If the file cannot be read, it invokes the `FAILED` macro with an appropriate error message, ensuring that the program does not proceed with `NULL` data. + +### Initializing HTML Encoding + +The core logic for handling character encoding begins with the initialization of the `lxb_html_encoding_t` struct via the `lxb_html_encoding_init` function. This struct is essential for managing encoding data throughout the program. If initialization fails, the program handles the error gracefully using the `FAILED` macro again. + +### Determining Encoding + +The most crucial part of the program is determining the HTML encoding with the `lxb_html_encoding_determine` function. This function analyzes the passed HTML data to determine its encoding. 
In the previous comment section, there is a mention of a 1024-byte limit, which reflects a common optimization practice where a program doesn't need to read the entire file if a meta encoding tag is typically found within the first 1024 bytes. However, this section is commented out, meaning the program currently reads the complete content. + +### Printing the Encoding + +Once the encoding is determined, the program retrieves the encoding entry using `lxb_html_encoding_meta_entry`. If a valid entry is found, it prints the encoding name. If no encoding is determined, it simply outputs that the encoding was not found. This provides the user with understandable feedback regarding the HTML file's character encoding. + +### Cleanup + +At the end of the program, whether successful or in the case of an error, memory cleanup is performed. The `lexbor_free` function is called to release the allocated memory for the HTML content, and `lxb_html_encoding_destroy` cleans up the encoding struct. This is an important step to prevent memory leaks and ensure proper resource management. + +## Conclusion + +The HTML Encoding example demonstrates essential practices such as error handling, memory management, and the use of a library to enhance functionality. By following this example, developers can understand how to utilize the Lexbor library for encoding detection in HTML documents, while also adhering to proper coding standards for readability and maintainability. \ No newline at end of file diff --git a/source/examples/html/html2sexpr.md b/source/examples/html/html2sexpr.md new file mode 100644 index 0000000..39f7327 --- /dev/null +++ b/source/examples/html/html2sexpr.md @@ -0,0 +1,89 @@ +# HTML to S-Expression Converter Example + +This article provides an overview of a code example found in the file `lexbor/html/html2sexpr.c`. The program is designed to convert an HTML tag tree into an S-expression string and output it to standard output. 
The program utilizes the Lexbor library to handle parsing and manipulating HTML documents. + +## Overview + +The program first checks if the correct number of command-line arguments is provided. It expects one argument: the path to an HTML file. It reads the contents of this file and initializes an HTML document object using Lexbor's API. After parsing the HTML, the program invokes a tree-walking function to serialize the HTML structure into an S-expression format. The serialized output is then printed to the console. + +## Major Code Sections + +### Argument Handling and File Reading + +The `main` function begins with argument validation. It ensures that exactly one argument is received; otherwise, it calls the `usage` function, which prints the program's usage instructions to standard error. + +```c +if (argc != 2) { + usage(); + FAILED("Invalid number of arguments"); +} +``` + +Upon validation, the program proceeds to read the HTML file using the `lexbor_fs_file_easy_read` function, which simplifies file reading: + +```c +html = lexbor_fs_file_easy_read((const lxb_char_t *) argv[1], &html_len); +``` + +If file reading fails, it reports an error and releases the relevant resources. + +### HTML Document Initialization and Parsing + +Next, the code creates an HTML document object with `lxb_html_document_create`. If this allocation fails, it destroys any previously allocated document and frees the memory associated with the HTML content: + +```c +document = lxb_html_document_create(); +``` + +After successfully creating the document, the program parses the HTML content: + +```c +status = lxb_html_document_parse(document, html, html_len); +``` + +This step processes the HTML string and builds a structured representation of the document. + +### Traversing the DOM and Serializing to S-Expression + +The `tree_walker` function is the core of the serialization process. It traverses the DOM tree recursively, converting each node into an S-expression format. 
+ +It begins by checking the type of each node. For elements, it calls the serialization callback `cb` to append the opening parenthesis, the node's name, and any attributes: + +```c +if (node->type == LXB_DOM_NODE_TYPE_ELEMENT) { + status = cb((const lxb_char_t *) "(", 1, ctx); + ... + // Invokes the attributes function + status = attributes(node, cb, ctx); +``` + +The `attributes` function iterates through each node's attributes and formats them as `(attribute_name 'attribute_value)` pairs, again using the callback to transmit this information. + +### Handling Template Nodes + +The `tree_walker` function includes logic to handle nodes of type `LXB_TAG_TEMPLATE`. If a node is a template and contains child nodes, it recursively calls `tree_walker` on them, ensuring that the contents of the template are also serialized: + +```c +if (node->local_name == LXB_TAG_TEMPLATE) { + ... + if (temp->content->node.first_child != NULL) { + status = tree_walker(&temp->content->node, cb, ctx); + } +} +``` + +### Cleanup and Exit Status + +After serialization is complete, the `main` function cleans up by destroying the document and freeing allocated memory. The program concludes by returning an appropriate exit status based on whether the operations succeeded or failed: + +```c +lxb_html_document_destroy(document); +lexbor_free(html); +return EXIT_SUCCESS; +``` + +In the case of failure at any point, the program proceeds to the `failed` label, ensuring resources are released before terminating. + +## Conclusion + +This example demonstrates a straightforward implementation of converting an HTML document structure into S-expressions using the Lexbor library. The program is structured to handle input validation, document parsing, tree traversal, and serialization efficiently while providing clear feedback in the case of errors. It showcases the use of Lexbor's DOM manipulation capabilities and highlights how to build a recursive tree-walking algorithm for tree serialization. 
\ No newline at end of file diff --git a/source/examples/html/parse.md b/source/examples/html/parse.md new file mode 100644 index 0000000..af79018 --- /dev/null +++ b/source/examples/html/parse.md @@ -0,0 +1,29 @@ +# HTML Parsing and Serialization Example + +This example demonstrates how to create an HTML parser using the lexbor library, parse simple HTML strings into document objects, and serialize those documents back to a readable format. The code is found in the source file `lexbor/html/parse.c`. + +## Code Overview + +The program begins by including the necessary header files and defining the main function, which is the entry point for execution. It declares several variables that will be needed throughout the parsing process, including the status of the parser, pointers to HTML document objects, and the HTML strings to be parsed. + +## Initialization + +First, the HTML parser is created with `lxb_html_parser_create()`, which allocates memory for the parser. It is essential to check that the parser was created successfully. The program initializes the parser with `lxb_html_parser_init(parser)`, and again checks for successful initialization. If there is a failure at either point, a failure message is printed, and the process is terminated. This aspect of the code ensures that the parser is correctly set up before proceeding further. + +## Parsing HTML + +Next, the program prepares two simple HTML snippets for parsing: `html_one` and `html_two`. These strings represent basic HTML structures containing a `div` with a `p` element. The lengths of these strings are calculated to facilitate parsing. + +The parsing occurs with `lxb_html_parse(parser, html_one, html_one_len)`, which attempts to parse the first HTML string and store the resulting document object in `doc_one`. A similar approach is taken for `doc_two`. In both cases, it is crucial to verify that the parsing was successful—if either document object is `NULL`, the program reports a failure. 
+ +## Serialization + +Once both documents are successfully created, the program proceeds to serialize them. The method `lxb_html_serialize_pretty_tree_cb()` is called for each document. This function is responsible for converting the document object back into a structured HTML format, with an option for pretty printing. The first argument converts the document into a DOM node interface, while the remaining arguments provide options for serialization. Again, the program checks the status to ensure serialization succeeded. + +## Cleanup + +After serialization, it is important to clean up resources. The program destroys the parser and the HTML document objects with `lxb_html_parser_destroy()` and `lxb_html_document_destroy()`, respectively. This step prevents memory leaks and ensures that all allocated resources are properly released. + +## Conclusion + +This example is a clear demonstration of the workflow when utilizing the lexbor library for HTML parsing and serialization. By handling initialization, parsing, serialization, and cleanup, the program effectively showcases how to work with HTML documents in a structured manner. The checks for status at each stage ensure robustness, making it easier to identify issues during development. \ No newline at end of file diff --git a/source/examples/html/parse_chunk.md b/source/examples/html/parse_chunk.md new file mode 100644 index 0000000..87fa83d --- /dev/null +++ b/source/examples/html/parse_chunk.md @@ -0,0 +1,79 @@ +# HTML Chunk Parsing Example + +This article provides an overview of the HTML chunk parsing example implemented in the source file `lexbor/html/parse_chunk.c`. The example demonstrates how to utilize the Lexbor HTML parsing library to handle HTML data in incremental chunks. By breaking the input into smaller pieces, it showcases the parser's versatility and ability to manage partial data streams effectively. + +## Code Overview + +The main function serves as the entry point for the program. 
Here, several significant components of the Lexbor library are employed, such as creating a parser, managing HTML documents, and serializing the parsed content. + +### Initialization + +The first step involves initializing the parser: + +```c +parser = lxb_html_parser_create(); +status = lxb_html_parser_init(parser); +``` + +In this section, `lxb_html_parser_create()` is called to create a new HTML parser instance, and `lxb_html_parser_init()` initializes it. It's crucial to check `status` to confirm that initialization succeeded. If initialization fails, a failure message is displayed. + +### Parsing Chunks + +After initialization, the code prepares to parse the HTML content chunk by chunk: + +```c +document = lxb_html_parse_chunk_begin(parser); +``` + +This line initializes parsing by creating a document object that will hold the parsed data. If the document object is not successfully created, an error message is emitted, halting further execution. + +The program then enters a loop to process the defined HTML chunks stored in a static array: + +```c +for (size_t i = 0; html[i][0] != '\0'; i++) { + status = lxb_html_parse_chunk_process(parser, html[i], + strlen((const char *) html[i])); + if (status != LXB_STATUS_OK) { + FAILED("Failed to parse HTML chunk"); + } +} +``` + +Here, `lxb_html_parse_chunk_process()` is called for each chunk of HTML until the end of the array is reached. The function takes three arguments: the parser instance, a pointer to the chunk data, and the length of that chunk. If parsing any chunk fails, it reports the error via the `FAILED` macro. + +### Finishing the Parsing + +After processing all the chunks, the parsing is concluded with: + +```c +status = lxb_html_parse_chunk_end(parser); +``` + +This function finalizes the parsing operation. Like the other stages, it checks if the operation succeeded, and handles any errors accordingly. 
+ +### Serialization + +Once the parsing is complete, the document's contents need to be serialized: + +```c +status = lxb_html_serialize_pretty_tree_cb(lxb_dom_interface_node(document), + LXB_HTML_SERIALIZE_OPT_UNDEF, + 0, serializer_callback, NULL); +``` + +This line serializes the parsed HTML tree into a human-readable format. The `lxb_dom_interface_node(document)` retrieves the root node of the parsed document for serialization. The use of the callback function allows for customization in how the output is processed. + +### Cleanup + +Finally, resource management is handled to prevent memory leaks: + +```c +lxb_html_document_destroy(document); +lxb_html_parser_destroy(parser); +``` + +These calls ensure that the allocated parser and document objects are properly destroyed, freeing resources that are no longer needed. + +## Conclusion + +The example provided in `lexbor/html/parse_chunk.c` is a straightforward illustration of how to parse HTML data incrementally with the Lexbor library. By breaking the input into manageable chunks, the parser can efficiently handle larger HTML documents and offers developers flexibility when processing dynamic or streamed data. This method is particularly useful in web environments where HTML content may not always be available as a single, complete document. \ No newline at end of file diff --git a/source/examples/html/tokenizer/callback.md b/source/examples/html/tokenizer/callback.md new file mode 100644 index 0000000..31e2731 --- /dev/null +++ b/source/examples/html/tokenizer/callback.md @@ -0,0 +1,40 @@ +# HTML Tokenizer Callback Example + +This article describes the implementation of an HTML Tokenizer Callback found in the `lexbor/html/tokenizer/callback.c` source file. The purpose of this code is to demonstrate how to parse an HTML string and handle tokens as they are generated. It establishes a callback mechanism that is invoked after each token is processed, allowing for custom processing or logging of token data. 
+ +## Overview + +The code begins by including necessary headers and defining a macro to handle error reporting. It then implements a token callback function, `token_callback`, which retrieves the tag name from a token, determines if the token represents a closing tag, and prints relevant details. The main function orchestrates the creation, initialization, and execution of the tokenizer. + +## Error Handling Macro + +The code defines a macro, `FAILED`, which simplifies error reporting and exits the program when an error occurs. This macro takes a format string and variadic arguments, prints the error message to standard error, and terminates the program with `EXIT_FAILURE`. This approach centralizes error handling and ensures that the program stops execution on critical failures. + +## Token Callback Function + +The function `token_callback` is critical as it processes each token generated by the tokenizer. It accepts three parameters: a pointer to the tokenizer, a pointer to the current token, and a context pointer (which is unused in this case). + +Within `token_callback`, the tag name is obtained using `lxb_tag_name_by_id`. If the tag name cannot be retrieved, the macro `FAILED` is invoked to log the error and exit. The token's type is checked to see if it indicates a closing tag. The results, including the tag name, ID, and whether it is a closing tag, are printed to standard output. + +## Main Function Execution Flow + +The `main` function contains several key operations: + +1. **Creating and Initializing the Tokenizer**: + The tokenizer is created using `lxb_html_tokenizer_create` and initialized with `lxb_html_tokenizer_init`. If any of these operations fail, the `FAILED` macro is invoked. + +2. **Setting the Token Callback**: + The tokenizer's callback function is set using `lxb_html_tokenizer_callback_token_done_set`, linking the tokenizer to the `token_callback` function defined earlier. + +3. 
**Beginning the Tokenization Process**: + The tokenization process is initiated with `lxb_html_tokenizer_begin`. This prepares the tokenizer for consuming HTML data. + +4. **Processing HTML Data**: + The provided HTML string (`"
test
"`) is processed by calling `lxb_html_tokenizer_chunk`, which reads a chunk of HTML to tokenize. After processing, the tokenizer is signaled to end its operation with `lxb_html_tokenizer_end`. + +5. **Cleanup**: + Finally, the tokenizer is destroyed using `lxb_html_tokenizer_destroy`, freeing up any resources allocated during its operation. + +## Summary + +This example illustrates the use of a callback function within a tokenizer to process HTML tokens sequentially. By gracefully handling errors and providing hooks for further processing, the code affords flexibility and clarity in parsing HTML inputs using the lexbor library. It exemplifies best practices in resource management, modular function design, and effective error handling in C. \ No newline at end of file diff --git a/source/examples/html/tokenizer/simple.md b/source/examples/html/tokenizer/simple.md new file mode 100644 index 0000000..8612647 --- /dev/null +++ b/source/examples/html/tokenizer/simple.md @@ -0,0 +1,37 @@ +# HTML Tokenizer Example + +This article provides a detailed explanation of an HTML tokenizer example implemented in C, demonstrating the capabilities of the lexbor library through the file `lexbor/html/tokenizer/simple.c`. This code is intended to parse a simple HTML string and display the tokens generated by the tokenizer. + +## Code Overview + +The main function of this code, `main`, initializes the tokenizer, sets a callback for token processing, and then processes a predefined HTML string. The tokenizer handles the parsing by breaking the HTML into tokens, which are processed by the `token_callback` function. + +### Tokenization Process + +1. **Initialization**: The tokenizer is created and initialized with `lxb_html_tokenizer_create()` and `lxb_html_tokenizer_init()`. If initialization fails, an error message is printed using the `FAILED` macro, which handles error reporting and exits the program. + +2. 
**Token Callback**: The `token_callback` function is registered as a callback to handle tokens generated by the tokenizer. This function processes different types of tokens: + - **End of File Token**: If the token indicates the end of the input, the function simply returns it. + - **Text Token**: If it is a text token, the function prints the text content enclosed by the appropriate markers. + - **HTML Tags**: For opening and closing tags, the function prints the tag name. If there are attributes, it prints them along with their values. The handling of attribute values takes care of different quoting styles (e.g., single or double quotes). + +### Main Function Execution + +The main logic begins with defining an HTML string using an array of characters. It prints the original HTML for clarity: + +```c +const lxb_char_t data[] = "
© Hi" + " my friend
"; +``` + +3. **Begin Tokenization**: After setting up the tokenizer and establishing the callback function, the tokenization process begins with `lxb_html_tokenizer_begin(tkz)`, which prepares the tokenizer to accept input. + +4. **Input Chunk Processing**: The example HTML data is passed to the tokenizer using `lxb_html_tokenizer_chunk(tkz, data, (sizeof(data) - 1))`. The tokenizer processes this chunk of data, generating tokens as defined in the callback function. + +5. **Ending Tokenization**: After processing the input, the tokenizer is finalized with `lxb_html_tokenizer_end(tkz)`, ensuring that all tokens are properly flushed and processed. + +6. **Cleanup**: Finally, the tokenizer resources are released with `lxb_html_tokenizer_destroy(tkz)` to prevent memory leaks. + +## Conclusion + +The provided example illustrates the basic functionality of the lexbor HTML tokenizer. It demonstrates how to set up a tokenizer, process HTML data, and define a callback to handle tokenization events. This example can serve as a foundation for more complex HTML processing tasks using lexbor, which is designed to efficiently handle HTML parsing requirements. \ No newline at end of file diff --git a/source/examples/html/tokenizer/tag_attributes.md b/source/examples/html/tokenizer/tag_attributes.md new file mode 100644 index 0000000..d5703c6 --- /dev/null +++ b/source/examples/html/tokenizer/tag_attributes.md @@ -0,0 +1,90 @@ +# Tokenization and Attribute Extraction Example + +This article explains the code found in the `tag_attributes.c` file of the lexbor project, which focuses on the tokenization of HTML content and the extraction of attributes from tokens. The primary purpose of this code is to parse a small fragment of HTML and output the attributes associated with each token. + +## Overview + +The `tag_attributes.c` file implements a simple HTML tokenizer. 
It initializes a tokenizer instance, feeds it some HTML data, and uses a callback function to process and display the attributes of parsed tokens. The tokenizer effectively handles different HTML tags and their attributes while logging any potential errors that may occur during the process. + +## Code Breakdown + +### Includes and Macros + +The file begins with including necessary header files: + +```c +#include "lexbor/html/tokenizer.h" +#include "lexbor/html/token_attr.h" +``` + +These headers provide definitions and functionalities related to HTML tokenization and attribute handling. + +The `FAILED` macro is defined to streamline error handling throughout the code. This macro takes a format string and variable arguments, prints the error message to standard error, and exits the program if an issue arises. + +### Token Callback Function + +The core of the token processing logic is in the `token_callback` function: + +```c +static lxb_html_token_t * +token_callback(lxb_html_tokenizer_t *tkz, lxb_html_token_t *token, void *ctx) +``` + +This function is called whenever a token is completed. It first checks if the token is a text node or has no attributes: + +```c +if (token->tag_id == LXB_TAG__TEXT || attr == NULL) { + return token; +} +``` + +If the token is a text node or has no attributes, the function returns immediately without further processing. Otherwise, it retrieves the name of the tag associated with the token using `lxb_tag_name_by_id`. A failure at this point will invoke the `FAILED` macro: + +```c +tag = lxb_tag_name_by_id(token->tag_id, NULL); +if (tag == NULL) { + FAILED("Failed to get token name"); +} +``` + +Assuming the tag name retrieval is successful, it prints out the tag's attributes. 
The `while` loop iterates through the list of attributes associated with the token: + +```c +while (attr != NULL) { + name = lxb_html_token_attr_name(attr, NULL); +``` + +For each attribute found, it checks if the name is valid; if not, it acknowledges the situation by noting that the name is not set, particularly handling tokens like `DOCTYPE`. The associated values of the attributes are likewise printed if they exist. + +### Main Function + +The `main` function orchestrates the entire process: + +```c +int main(int argc, const char *argv[]) +``` + +This function initializes the tokenizer and sets up the HTML string for parsing. The HTML fragment being parsed includes a `div` tag with several attributes and nested `option` tags. It first prints the HTML string to the console: + +```c +const lxb_char_t data[] = "
" + "" + ""; +``` + +Next, it creates and initializes the tokenizer: + +```c +tkz = lxb_html_tokenizer_create(); +status = lxb_html_tokenizer_init(tkz); +``` + +In case of an error during the tokenizer's creation or initialization, it utilizes the `FAILED` macro to handle the error appropriately. + +The callback function for token completion is set, and the tokenizer begins processing the HTML data. It processes the input by calling `lxb_html_tokenizer_chunk`, and if any issues arise during these stages, the `FAILED` macro is utilized once more to identify failures in parsing. + +Finally, the tokenizer is destroyed, freeing any resources it allocated during its execution, and the program returns 0, indicating a successful run. + +## Conclusion + +This example illustrates the process of HTML tokenization using the lexbor library. By implementing a callback to handle parsed tokens, the code effectively extracts and displays attribute names and values from the given HTML fragment. It showcases the ability to manage errors gracefully while providing informative output for attribute processing within tokens. \ No newline at end of file diff --git a/source/examples/html/tokenizer/text.md b/source/examples/html/tokenizer/text.md new file mode 100644 index 0000000..fb361b5 --- /dev/null +++ b/source/examples/html/tokenizer/text.md @@ -0,0 +1,91 @@ +# HTML Tokenizer Example + +This article describes the functionality of the example code provided in the file `lexbor/html/tokenizer/text.c`. The code implements an HTML tokenizer using the Lexbor library, focusing on extracting and printing text tokens from HTML input. + +## Overview of the Code + +The main thrust of this code is to parse HTML data, identify text tokens within it, and print those tokens to the standard output. The code utilizes functions provided by the Lexbor library, a lightweight and efficient HTML and XML processing library. 
+ +## Key Sections of the Code + +### Header and Macros + +The code begins with the inclusion of the `lexbor/html/tokenizer.h` header file, which contains the necessary declarations for using the tokenizer functionality of the Lexbor library. Following this, a macro named `FAILED` is defined. This macro can be used throughout the code to simplify error handling: + +```c +#define FAILED(...) \ + do { \ + fprintf(stderr, __VA_ARGS__); \ + fprintf(stderr, "\n"); \ + exit(EXIT_FAILURE); \ + } \ + while (0) +``` + +It takes a format string and arguments to generate error messages. When invoked, it prints the message to standard error and terminates the program. + +### Token Callback Function + +Next, there is the `token_callback` function that manages the processing of tokens emitted by the tokenizer: + +```c +static lxb_html_token_t * +token_callback(lxb_html_tokenizer_t *tkz, lxb_html_token_t *token, void *ctx) +{ + /* Skip all not #text tokens */ + if (token->tag_id != LXB_TAG__TEXT) { + return token; + } + + printf("%.*s", (int) (token->text_end - token->text_start), + token->text_start); + + return token; +} +``` + +The function checks whether the token is a text token (identified by `LXB_TAG__TEXT`). If it is not, it simply returns the token without further processing. For text tokens, it prints the text content to standard output using the `printf` function. This content is extracted from the token's `text_start` and `text_end` fields, which indicate the starting and ending positions of the text within the HTML data. + +### Main Function + +Finally, the `main` function orchestrates the tokenizer's operation: + +```c +int main(int argc, const char *argv[]) +{ + ... + const lxb_char_t data[] = "
Hi my friend
! " + "Try ent" + "ities!"; + + ... + + tkz = lxb_html_tokenizer_create(); + status = lxb_html_tokenizer_init(tkz); + if (status != LXB_STATUS_OK) { + FAILED("Failed to create tokenizer object"); + } + ... + + status = lxb_html_tokenizer_chunk(tkz, data, (sizeof(data) - 1)); + if (status != LXB_STATUS_OK) { + FAILED("Failed to parse the html data"); + } + + ... + + lxb_html_tokenizer_destroy(tkz); + + return 0; +} +``` + +The HTML input is defined as a character array that includes HTML elements and character references. The code creates a tokenizer instance using `lxb_html_tokenizer_create()` and initializes it with `lxb_html_tokenizer_init()`. If these operations fail, the `FAILED` macro is called to report the issue and exit. + +The tokenizer callback is set through `lxb_html_tokenizer_callback_token_done_set()`, linking the `token_callback` function to handle tokens once they are fully parsed. The main parsing operations occur through `lxb_html_tokenizer_begin()` and `lxb_html_tokenizer_chunk()`, processing the data until the end of the input with `lxb_html_tokenizer_end()`. + +Finally, the tokenizer instance is destroyed with `lxb_html_tokenizer_destroy(tkz)`, which frees up resources allocated during the process. + +## Conclusion + +This example provides a clear illustration of how to utilize the Lexbor library to parse HTML and process text tokens. By focusing on text tokens, and employing proper error handling mechanics, the code demonstrates a concise yet effective approach to basic HTML tokenization. \ No newline at end of file diff --git a/source/examples/index.md b/source/examples/index.md new file mode 100644 index 0000000..4e9ef05 --- /dev/null +++ b/source/examples/index.md @@ -0,0 +1,71 @@ +--- +orphan: true +--- + +These articles delve into some of the examples in more detail to help you use `lexbor` more effectively. 
+ +CSS: + +```{toctree} +:maxdepth: 1 +:glob: + +css/* +css/selectors/* +css/syntax/* +css/syntax/tokenizer/* +``` + +Encoding: + +```{toctree} +:maxdepth: 1 +:glob: + +encoding/* +encoding/buffer/* +encoding/buffer/decode/* +encoding/buffer/encode/* +encoding/single/* +encoding/single/decode/* +encoding/single/encode/* +``` + +HTML: + +```{toctree} +:maxdepth: 1 +:glob: + +html/* +html/tokenizer/* +``` + +Unicode and Punycode: + +```{toctree} +:maxdepth: 1 +:glob: + +punycode/* +unicode/* +``` + +Selectors and styles: + +```{toctree} +:maxdepth: 1 +:glob: + +selectors/* +styles/* +``` + +URLs: + +```{toctree} +:maxdepth: 1 +:glob: + +url/* +``` diff --git a/source/examples/punycode/decode.md b/source/examples/punycode/decode.md new file mode 100644 index 0000000..d6b86c7 --- /dev/null +++ b/source/examples/punycode/decode.md @@ -0,0 +1,81 @@ +# Punycode Decoding Example + +This article explains the implementation of a Punycode decoding utility found in the `lexbor/punycode/decode.c` file. The code example facilitates the decoding of encoded domain names into their regular representation, which is critical for handling internationalized domain names (IDNs). + +## Overview + +The core function of this program reads input from standard input, decodes it using the Lexbor library's Punycode functionality, and outputs the decoded string to standard output. Below, we detail the main components of the code, their functionality, and the logic behind the operations. + +## Main Function + +The `main` function serves as the entry point of the program. It sets up the necessary variables and handles the reading, reallocating, and decoding of data. + +### Variable Declarations + +The program begins by declaring several important variables: + +- `loop`: A boolean flag to control the reading loop. +- `size` and `nsize`: Size variables for managing buffer sizes. +- `status`: A variable to hold the status returned by functions. +- `inbuf`: A temporary buffer for reading input. 
+- Pointers `buf`, `end`, `p`, and `tmp`: For managing dynamic memory. + +### Memory Allocation + +Memory is allocated for `buf` using `lexbor_malloc`, which allocates space equal to the size of `inbuf`. If memory allocation fails, the program outputs an error message and exits with `EXIT_FAILURE`. + +### Reading Input + +The program enters a `do-while` loop to read from standard input: + +```c +size = fread(inbuf, 1, sizeof(inbuf), stdin); +``` + +If the read operation does not return the full buffer size, it checks if the end of the file (EOF) is reached or if an error occurred. In either case, the program handles these conditions appropriately. + +### Buffer Management + +Before storing more data into `buf`, the program checks if there is enough space: + +```c +if (p + size > end) { + nsize = (end - buf) * 3; + tmp = lexbor_realloc(buf, nsize); + ... +} +``` + +If there isn’t sufficient space, it reallocates memory to increase the buffer size by threefold. If this operation fails, an error message is displayed and the program jumps to the `failed` label to free allocated memory and exit. + +### Input Cleaning + +After reading input, the program checks and trims any trailing newline (`\n`) or carriage return (`\r`) characters for proper formatting before decoding begins. + +### Decoding Process + +The actual decoding is performed by the `lxb_punycode_decode` function, which takes the prepared buffer and calls a callback function: + +```c +status = lxb_punycode_decode(buf, p - buf, callback, NULL); +``` + +This function executes the decoding, and if it fails, an error message is printed, and cleanup is performed. + +### Output and Cleanup + +Once decoding is successful, the program prints a newline for formatting and then frees the allocated memory before exiting successfully. + +## Callback Function + +The `callback` function is defined to handle the output of each decoded segment. 
It receives the decoded data and its length, printing it to standard output: + +```c +printf("%.*s", (int) len, (const char *) data); +``` + +This function is simple yet crucial, as it formats and handles how the decoded data is displayed. + +## Conclusion + +This example demonstrates how to utilize the Lexbor library for Punycode decoding in C. The program handles memory management, input reading, and decoding efficiently while ensuring robustness against common issues like memory allocation failures. Through this utility, developers can work with internationalized domain names effectively, translating them into human-readable forms. \ No newline at end of file diff --git a/source/examples/punycode/encode.md b/source/examples/punycode/encode.md new file mode 100644 index 0000000..d369a19 --- /dev/null +++ b/source/examples/punycode/encode.md @@ -0,0 +1,93 @@ +# Punycode Encoding Example + +This article discusses the code example found in the file `lexbor/punycode/encode.c`, which demonstrates how to encode a string using the Punycode algorithm with the lexbor library. Punycode is a way to represent Internationalized Domain Names (IDNs) using only ASCII characters. This code facilitates reading input data, manages memory allocation dynamically, and encodes the input using a callback function to handle the output. + +## Code Explanation + +The main function plays a central role in this example. It starts by defining several variables for handling the buffer, input data, and status codes. An important portion of the code is responsible for memory management, particularly the allocation and potential reallocation of memory needed to store the input. + +### Memory Allocation + +The first crucial step involves allocating memory for the buffer, which will hold the input data. The `lexbor_malloc` function is called to allocate memory equivalent to the size of `inbuf`. If the allocation fails, an error message is printed, and the program exits with `EXIT_FAILURE`. 
+ +```c +buf = lexbor_malloc(sizeof(inbuf)); +if (buf == NULL) { + printf("Failed memory allocation.\n"); + return EXIT_FAILURE; +} +``` + +### Reading Input + +The program uses a loop to read input from standard input using `fread`. It attempts to read up to `sizeof(inbuf)` bytes into `inbuf`. After reading, it checks if the end of the file is reached and appropriately modifies the loop control variable. + +```c +size = fread(inbuf, 1, sizeof(inbuf), stdin); +if (size != sizeof(inbuf)) { + if (feof(stdin)) { + loop = false; + } + else { + return EXIT_FAILURE; + } +} +``` + +### Handling Buffer Overflow + +Another significant section of the code checks whether the size of the input exceeds the buffer's capacity. If it does, it reallocates memory for the buffer using `lexbor_realloc`, aiming to increase its size by a multiple of three. This is a proactive approach to accommodating larger inputs. + +```c +if (p + size > end) { + nsize = (end - buf) * 3; + + tmp = lexbor_realloc(buf, nsize); + if (tmp == NULL) { + printf("Failed memory reallocation.\n"); + goto failed; + } + + p = tmp + (p - buf); + buf = tmp; + end = tmp + nsize; +} +``` + +### Encoding Input + +Once the input is collected and appropriately buffered, the code trims any trailing newline or carriage return characters. It then calls the `lxb_punycode_encode` function, passing the buffer and the length of the data, as well as a callback function to handle the encoded output. + +```c +status = lxb_punycode_encode(buf, p - buf, callback, NULL); +if (status != LXB_STATUS_OK) { + printf("Failed decode.\n"); + goto failed; +} +``` + +The callback function `callback` is defined later in the file. It simply prints each chunk of encoded output it receives to standard output and returns `LXB_STATUS_OK`; the conversion to ASCII itself is performed by `lxb_punycode_encode`, not by the callback. 
+ +```c +static lxb_status_t +callback(const lxb_char_t *data, size_t len, void *ctx, bool unchanged) +{ + printf("%.*s", (int) len, (const char *) data); + + return LXB_STATUS_OK; +} +``` + +### Cleanup and Error Handling + +Throughout the code, error handling is emphasized. If any memory operation fails, the program exits gracefully by freeing any allocated memory before termination. This ensures that the application does not lead to memory leaks. + +```c +failed: + lexbor_free(buf); + return EXIT_FAILURE; +``` + +## Conclusion + +This article provides a comprehensive overview of the `lexbor/punycode/encode.c` example, illustrating how to implement Punycode encoding in C. The example highlights important practices such as dynamic memory management, error handling, and the use of callback functions, which are all vital when dealing with input and output in systems programming. By following this structured approach, developers can efficiently utilize the lexbor library to handle Internationalized Domain Names. \ No newline at end of file diff --git a/source/examples/selectors/easy_way.md b/source/examples/selectors/easy_way.md new file mode 100644 index 0000000..daeadd6 --- /dev/null +++ b/source/examples/selectors/easy_way.md @@ -0,0 +1,37 @@ +# CSS Selectors Usage Example + +This article explains an example program found in the file `lexbor/selectors/easy_way.c`, which demonstrates how to use the Lexbor library to parse HTML and match it against CSS selectors. The example involves creating an HTML document, defining CSS selectors, and then finding matching nodes in the document. + +## Overview of the Code + +The program begins with the inclusion of necessary headers from the Lexbor library, specifically for handling HTML documents and CSS selectors. The primary functionalities are encapsulated in multiple functions, including the `callback` function, which prints matched nodes, and the `find_callback` function, which keeps track of the count of found nodes. 
+ +### Function Definitions + +- **callback**: This function acts as a callback for serializing HTML nodes. It takes a pointer to data representing the node's content and its length, printing the content to the standard output. + +- **find_callback**: This callback function is invoked for each matching node found by the CSS selectors. It increments the count of matched nodes, prints the count, and calls the serialization callback to output the node’s content. + +### Main Function Breakdown + +1. **Initialization**: The `main` function begins by declaring variables for counting matches, managing the status of various operations, and holding references to the document, selectors, parser, and selector list. + +2. **HTML and CSS Data**: The example defines a string of HTML containing a `div` with two `p` elements and a string of CSS selectors to match. Specifically, the selectors include a class selector (`.x`) and a compound selector that checks for a `p` element with an `id` of 'y'. + +3. **Creating an HTML Document**: An HTML document object is created and initialized with the HTML string. The document must be parsed successfully; otherwise, the program exits with a failure status. + +4. **CSS Parser Setup**: A CSS parser object is created and initialized, which is necessary for processing the selector strings. + +5. **Selectors Creation**: A selectors object is initialized to handle the parsing of the CSS selectors. This involves calling `lxb_selectors_create` and then initializing it with `lxb_selectors_init`. + +6. **Parsing Selectors**: The CSS selectors string is parsed, and a list of selectors is generated using `lxb_css_selectors_parse`. The status is checked to ensure that parsing was successful. + +7. **Serialization of Selectors**: The program prints out the serialized selectors using `lxb_css_selector_serialize_list_chain`, which utilizes the previously defined `callback` function to output each selector. + +8. 
**Finding Matching Nodes**: The program identifies the body of the HTML document and utilizes the `lxb_selectors_find` function to locate nodes that match the defined selectors. The `find_callback` function processes each matching node. + +9. **Memory Management**: After processing, the program properly deallocates memory used for selectors, the CSS parser, and the HTML document to prevent memory leaks. + +### Conclusion + +This example demonstrates the effective use of the Lexbor library for manipulating and selecting elements within HTML documents based on CSS selectors. By understanding how to parse both HTML and CSS, and by using callback functions to manage matched nodes, developers can efficiently implement feature-rich web applications. The careful structure of the code ensures maintainability and readability, adhering to best practices in C programming. \ No newline at end of file diff --git a/source/examples/selectors/normal_way.md b/source/examples/selectors/normal_way.md new file mode 100644 index 0000000..1ea3c75 --- /dev/null +++ b/source/examples/selectors/normal_way.md @@ -0,0 +1,69 @@ +# CSS Selectors Parsing and Node Finding Example + +This example, found in the source file `lexbor/selectors/normal_way.c`, demonstrates how to use the Lexbor library to parse CSS selectors and find HTML nodes that match those selectors. The code provides a comprehensive workflow, from creating an HTML document to parsing selectors and retrieving matching nodes while handling memory management efficiently. + +## Overview of Key Components + +The main function serves as the central processing unit of the code, orchestrating the various tasks. It initializes necessary structures, parses an HTML string, sets up CSS selectors, and employs the Lexbor library's capabilities to find nodes in the document. + +### HTML and CSS Data + +The example uses the HTML string `"<div><p class='x z'> </p><p id='y'>abc</p></div>"`, which contains two `<p>` elements, one with class attributes `x` and `z`, and another with the ID `y`. This HTML will be parsed to create a document object. + +Two CSS selector strings are defined: `".x, div:has(p[id=Y i])"` and `"p:blank"`. These selectors aim to demonstrate the capabilities of the library to handle various matching criteria. + +### Document Creation and Parsing + +The code begins by creating an HTML document using the function `lxb_html_document_create()`. It then parses the HTML content with `lxb_html_document_parse()`. If parsing fails (indicated by a non-OK status), the function exits, ensuring that subsequent operations are performed on a valid document. + +```c +document = lxb_html_document_create(); +status = lxb_html_document_parse(document, html, sizeof(html) / sizeof(lxb_char_t) - 1); +``` + +### Memory Management + +Proper memory management is crucial in C programming. The code allocates memory for parsed structures using `lxb_css_memory_create()`, initializing it with a specified size. This guarantees that the structures can be populated without running into memory issues. + +```c +memory = lxb_css_memory_create(); +status = lxb_css_memory_init(memory, 128); +``` + +### CSS Parser and Selector Setup + +A CSS parser is created with `lxb_css_parser_create()`, and its settings are adjusted to work with the previously created memory. The CSS selectors are set up with `lxb_css_selectors_create()` and initialized, ensuring that they can efficiently handle subsequent parsing requests. + +Important to note is the line where the parser is instructed not to create a new selector object for each call, thereby enhancing performance during parsing iterations: + +```c +lxb_css_parser_selectors_set(parser, css_selectors); +``` + +### Selector Parsing and Serialization + +The selectors defined earlier are parsed using `lxb_css_selectors_parse()`. The resulting lists (`list_one` and `list_two`) contain the parsed representations of the selectors. 
If parsing fails, the program exits gracefully. + +After parsing, the example demonstrates HTML serialization through `lxb_html_serialize_pretty_deep_cb()` and outputs the selectors using `lxb_css_selector_serialize_list_chain()`, allowing for a visual check of the parsed structures. + +### Finding Nodes by Selectors + +The example then proceeds to find HTML nodes using the parsed selectors. It leverages the `lxb_selectors_find()` function, along with a callback function `find_callback`, to process each matching node. This function simply counts the nodes found and prints their representation. + +```c +status = lxb_selectors_find(selectors, body, list_one, find_callback, &count); +``` + +### Cleanup and Memory Deallocation + +Once all operations are completed, the code carefully deallocates all allocated resources to prevent memory leaks. It uses the appropriate destroy functions for each created object, adhering to good practices in C coding. + +```c +(void) lxb_selectors_destroy(selectors, true); +(void) lxb_css_parser_destroy(parser, true); +(void) lxb_css_memory_destroy(memory, true); +``` + +## Conclusion + +In summary, this example outlines a practical implementation of HTML and CSS handling using the Lexbor library. It emphasizes the importance of robust memory management, selector parsing, and node finding functionalities, making it a valuable reference for developers looking to understand or utilize Lexbor in their projects. \ No newline at end of file diff --git a/source/examples/selectors/unique_nodes.md b/source/examples/selectors/unique_nodes.md new file mode 100644 index 0000000..97f95dc --- /dev/null +++ b/source/examples/selectors/unique_nodes.md @@ -0,0 +1,41 @@ +# CSS Selectors and HTML Node Selection Example + +This article discusses the functionality of the `unique_nodes.c` source file, which implements a basic example of parsing HTML and CSS selectors using the lexbor library. 
The example illustrates how to create an HTML document, parse CSS selectors, and find nodes within the document that match those selectors. + +## Key Components + +### HTML and CSS Data + +At the beginning of the main function, HTML and CSS data are defined. The HTML consists of a `<div>` containing two `<p>` elements, while the CSS contains several selectors, including class selectors, id selectors, and pseudo-class selectors. This data is crucial as it lays the groundwork for the subsequent parsing and node selection processes. + +### Creating an HTML Document + +The code then creates an HTML document using `lxb_html_document_create()` and populates it with the previously defined HTML data. The `lxb_html_document_parse()` function is called to parse the HTML data into a structured format. If parsing fails, the program exits with a failure status. This step transforms the provided HTML string into a DOM (Document Object Model) representation that can be interacted with programmatically. + +### Creating a CSS Parser + +Following the creation of the HTML document, a CSS parser is instantiated with `lxb_css_parser_create()`. This is complemented by an initialization call to `lxb_css_parser_init()`. The parser is necessary for interpreting the CSS selectors provided in the string format. The proper functioning of the parsing depends on successful initialization, and any failure at this stage leads to an exit. + +### CSS Selector Processing + +A CSS selector object is created using `lxb_css_selectors_create()`, and similarly initialized to prepare for parsing operations. It is important to note that the program avoids creating new selector objects each time the parser is called by setting the CSS selectors on the parser with `lxb_css_parser_selectors_set()`. This optimization ensures efficient memory usage and performance. + +### Parsing the Selectors + +The CSS selectors are parsed using `lxb_css_selectors_parse()`, which generates a list of selectors ready for matching with the document's nodes. If parsing fails, the program exits. This list is critical for the next steps, allowing the program to identify nodes that match the defined selectors. 
+ +### Serializing HTML and Selectors + +The program outputs the serialized format of the HTML document using `lxb_html_serialize_pretty_deep_cb()`, which calls a callback function to print each node. This is useful for visual verification of the document structure. Similarly, the selectors are serialized with `lxb_css_selector_serialize_list_chain()`, enabling the user to see which selectors have been parsed and are ready for matching. + +### Finding HTML Nodes + +The core functionality of this example is encapsulated in the `lxb_selectors_find()` function, which takes the selectors and attempts to match them against the nodes in the document’s body. A callback function, `find_callback`, is provided to handle each found node, incrementing a count and processing each matched node individually. If any part of this process fails, the program suitably returns an error status. + +### Cleanup + +Finally, the program ensures that all allocated resources are correctly disposed of. Various destroy functions are called for the selectors, CSS parser, and the HTML document to prevent memory leaks. This step is essential in any robust application to maintain system performance and reliability. + +## Conclusion + +The `unique_nodes.c` example illustrates a practical application of the lexbor library to handle HTML documents and CSS selectors. By showcasing the entire lifecycle from parsing HTML to finding nodes based on CSS selectors, this example serves as an informative foundation for developers looking to work with document structures and styles in C using the lexbor library. The implemented logic emphasizes efficiency and clarity, ensuring that the handling of selectors and nodes is both effective and straightforward. 
\ No newline at end of file diff --git a/source/examples/styles/attribute_style.md b/source/examples/styles/attribute_style.md new file mode 100644 index 0000000..ef3ba98 --- /dev/null +++ b/source/examples/styles/attribute_style.md @@ -0,0 +1,42 @@ +# CSS Style Attribute Example + +This article provides an in-depth explanation of a code example found in the `lexbor/styles/attribute_style.c` file. The purpose of this code is to demonstrate how to create an HTML document, parse a specific HTML element, retrieve its CSS style properties, and then serialize those properties for output. + +## Code Breakdown + +### Header Files and Function Definition + +The code begins with necessary includes, specifically `base.h`, along with lexbor’s HTML and CSS header files. This setup ensures that all necessary functions related to HTML document handling and CSS processing are available. + +The `callback` function serves as a utility to print CSS property declarations. It takes a character pointer `data`, the length of data `len`, and a context pointer `ctx`. It uses `printf` to output the string, formatting it based on the provided length. This function is fundamental for logging purposes throughout the serialization process. + +### Main Function + +The `main` function is where the primary logic occurs: + +1. **Document Creation**: + The first step is to create a new HTML document using `lxb_html_document_create()`. If the document fails to create, it reports an error and halts execution using the `FAILED` macro. + +2. **CSS Initialization**: + Following document creation, `lxb_html_document_css_init(doc)` initializes the CSS environment for the document. Again, a failure results in termination. + +3. **HTML Parsing**: + The code employs `lxb_html_document_parse(doc, html.data, html.length)` to parse a static HTML string that contains a `

` with CSS inline styles. The inline styles include various widths and heights in different units. This parsing step builds the DOM structure of the HTML. + +4. **Element Retrieval**: + A `lxb_dom_collection_t` is initialized to hold results. The function `lxb_dom_node_by_tag_name()` retrieves elements by their tag name, specifically targeting the `
` tag. If retrieval fails, execution is halted. + +5. **CSS Property Access**: + The example seeks to extract specific style properties from the `
`. It retrieves the `width` property by name and the `height` property by its corresponding ID using `lxb_html_element_style_by_name` and `lxb_html_element_style_by_id`, respectively. Errors during this stage lead to failure messages. + +### Serialization and Output + +After acquiring the width and height styles, the example moves to serialize these properties. The `lxb_css_rule_declaration_serialize()` function is called twice, once for each property, passing the `callback` function to handle output. The results are printed to the console, showcasing the values for both properties. + +### Cleanup + +The `lxb_dom_collection_destroy()` function cleans up the DOM collection used to store the `
` elements, while `lxb_html_document_destroy(doc)` releases the memory allocated for the document. This cleanup ensures no memory leaks occur during program execution. + +## Conclusion + +This code example illustrates how to manipulate and retrieve CSS properties from an HTML element using the lexbor library. It covers creating an HTML document, parsing content, accessing specific elements, and outputting style properties, providing a comprehensive look at handling HTML and CSS in C with lexbor. The example highlights the importance of proper resource management and error reporting within such operations, which is essential for building robust applications. \ No newline at end of file diff --git a/source/examples/styles/events_insert.md b/source/examples/styles/events_insert.md new file mode 100644 index 0000000..ff66acb --- /dev/null +++ b/source/examples/styles/events_insert.md @@ -0,0 +1,113 @@ +# Events Insert Example + +This article explains the C code found in `lexbor/styles/events_insert.c`, which demonstrates the process of manipulating HTML documents and applying CSS styles using the Lexbor library. The code operates on a simple HTML structure and applies specific styles based on a CSS stylesheet. + +## Overview + +The provided code initializes an HTML document representation, parses a predefined HTML string, applies a CSS stylesheet, and manipulates the DOM to insert a new HTML element. Here’s a breakdown of the major sections of the code. + +## Code Breakdown + +### Includes and Definitions + +The code begins with the inclusion of necessary header files from the Lexbor library, which are essential for HTML, CSS, and selector functionalities: + +```c +#include <lexbor/html/html.h> +#include <lexbor/css/css.h> +#include <lexbor/selectors/selectors.h> +``` + +These headers allow access to functions and data structures needed to create and manipulate HTML and CSS documents. 
+ +### Callback Function + +A callback function named `callback` is implemented to handle data output when invoked.
This function prints data received from serialized output processes: + +```c +lxb_status_t callback(const lxb_char_t *data, size_t len, void *ctx) { + printf("%.*s", (int) len, (const char *) data); + return LXB_STATUS_OK; +} +``` + +Its purpose is to print formatted strings, assisting in visual output of the document processes. + +### Main Function + +The `main` function encapsulates the program logic. It starts by defining various variables and static data for HTML and CSS. + +```c +static const lexbor_str_t html = lexbor_str("<div class=\"father\">...</div>"); +static const lexbor_str_t slctrs = lexbor_str("div.father {...}"); +``` + +Here, `html` contains a `<div>` with class "father" and some child elements, while `slctrs` defines CSS rules for styling the div and its child paragraphs. + +### Document Creation and Parsing + +An HTML document is created using: + +```c +document = lxb_html_document_create(); +``` + +The document is then parsed with the defined HTML string: + +```c +status = lxb_html_document_parse(document, html.data, html.length); +``` + +If any operation fails, the program exits to ensure that no subsequent operations are performed on an invalid document structure. + +### CSS Initialization and Parsing + +Next, the code initializes the CSS subsystem of the document: + +```c +status = lxb_html_document_css_init(document); +``` + +After this initialization, a CSS parser is created and initialized. The CSS stylesheet is parsed and attached to the HTML document: + +```c +sst = lxb_css_stylesheet_parse(parser, slctrs.data, slctrs.length); +status = lxb_html_document_stylesheet_attach(document, sst); +``` + +At this stage, all elements in the document receive styles defined in the stylesheet. + +### Element Creation and Attribute Setting + +The code then seeks to manipulate the DOM by creating a new paragraph element (`<p>

`). This process involves setting attributes that apply styles from the stylesheet: + +```c +np = lxb_html_document_create_element(document, p_str.data, p_str.length, NULL); +attr = lxb_dom_element_set_attribute(lxb_dom_interface_element(np), class_str.data, class_str.length, best_str.data, best_str.length); +``` + +Here, the element is given a class of "best" for styling purposes, followed by another attribute for inline styling. + +### Inserting the New Element + +Once the new element is fully prepared with the appropriate attributes, it is appended to the "father" div: + +```c +lxb_html_element_insert_child(div, np); +``` + +This action makes it part of the document's tree structure, and consequently, it inherits styling based on CSS rules. + +### Final Serialization and Resource Cleanup + +The program serializes the new element and produces output that reflects the changes made: + +```c +status = lxb_html_serialize_cb(lxb_dom_interface_node(np), callback, NULL); +``` + +Finally, all allocated resources are cleaned up to prevent memory leaks by destroying collections, stylesheets, and the document itself. + +## Conclusion + +The code in `lexbor/styles/events_insert.c` illustrates an effective use of the Lexbor library to manipulate HTML and apply CSS. By parsing, creating elements, setting attributes, and attaching styles, it provides a clear example of dynamic document editing and processing. This showcases both the capabilities and convenience of the Lexbor framework in handling web technologies programmatically. \ No newline at end of file diff --git a/source/examples/styles/stylesheet.md b/source/examples/styles/stylesheet.md new file mode 100644 index 0000000..534cea3 --- /dev/null +++ b/source/examples/styles/stylesheet.md @@ -0,0 +1,135 @@ +# CSS Stylesheet Parsing and Application Example + +In this article, we will explore the implementation of CSS stylesheet parsing and application to HTML elements using the Lexbor library. 
The following example is derived from the source file `lexbor/styles/stylesheet.c`. The code illustrates how to create an HTML document, parse CSS styles, attach these styles to the HTML document, and finally retrieve and serialize specific style declarations from an element. + +## Overview + +The core of the example revolves around creating a minimal HTML document that contains a `

<div>` element with inline CSS styles. The code then initializes the Lexbor HTML and CSS parsers, processes the provided CSS, and attaches the styles to the HTML document. Finally, it retrieves specific CSS properties (width and height) from the `<div>
` element and serializes them for output. + +## Code Breakdown + +### Creating the HTML Document + +Initially, the program creates an HTML document by calling `lxb_html_document_create()`. If the document creation fails, it triggers a failure message: + +```c +doc = lxb_html_document_create(); +if (doc == NULL) { + FAILED("Failed to create HTML Document"); +} +``` + +This part is crucial as it establishes a context for parsing HTML and applying styles. + +### Initializing the CSS Parser + +Next, the code initializes the CSS system for the document with: + +```c +status = lxb_html_document_css_init(doc); +if (status != LXB_STATUS_OK) { + FAILED("Failed to CSS initialization"); +} +``` + +Proper initialization allows the program to manage CSS styles associated with the HTML document confidently. + +### Parsing the CSS Stylesheet + +The CSS stylesheet is then created and parsed. The process involves instantiating a CSS parser with: + +```c +parser = lxb_css_parser_create(); +status = lxb_css_parser_init(parser, NULL); +if (status != LXB_STATUS_OK) { + FAILED("Failed to initialization CSS parser"); +} +``` + +Once the parser is initialized, the `lxb_css_stylesheet_parse()` function gets called to parse the provided CSS string, which contains styling rules for the `
<div>`: + +```c +sst = lxb_css_stylesheet_parse(parser, css.data, css.length); +if (sst == NULL) { + FAILED("Failed to parse CSS StyleSheet"); +} +``` + +Successfully parsing the stylesheet is essential for associating styles with the HTML elements. + +### Parsing the HTML Document + +Following the CSS parsing, the example proceeds to parse the HTML content: + +```c +status = lxb_html_document_parse(doc, html.data, html.length); +if (status != LXB_STATUS_OK) { + FAILED("Failed to parse HTML"); +} +``` + +This transformation processes the HTML string into a structure that can be navigated and manipulated. + +### Attaching the Stylesheet + +The program then links the parsed CSS stylesheet to the HTML document: + +```c +status = lxb_html_document_stylesheet_attach(doc, sst); +if (status != LXB_STATUS_OK) { + FAILED("Failed to parse HTML"); +} +``` + +This attachment allows the styles to take effect when querying elements. + +### Retrieving Element Styles + +To get the styles applied to the `<div>`, the code initializes a collection to store the gathered elements: + +```c +memset(&collection, 0, sizeof(lxb_dom_collection_t)); + +status = lxb_dom_node_by_tag_name(lxb_dom_interface_node(doc), &collection, + str_div.data, str_div.length); +if (status != LXB_STATUS_OK) { + FAILED("Failed to get element by name"); +} +``` + +By calling `lxb_dom_node_by_tag_name()`, the program fetches the `<div>
` element, which is then referenced to retrieve style declarations for specific properties: + +```c +width = lxb_html_element_style_by_name(lxb_html_interface_element(div), + str_width.data, str_width.length); +height = lxb_html_element_style_by_id(lxb_html_interface_element(div), + LXB_CSS_PROPERTY_HEIGHT); +``` + +This logic effectively retrieves both width and height style settings applied to the element. + +### Serializing Styles + +To output the retrieved styles, the code serializes each one using the `lxb_css_rule_declaration_serialize()` function, which takes a callback function to handle the output: + +```c +status = lxb_css_rule_declaration_serialize(width, callback, NULL); +status = lxb_css_rule_declaration_serialize(height, callback, NULL); +``` + +Here, the `callback` function simply prints the CSS properties to the console. + +### Cleanup + +As part of good coding practice, the program ends by freeing allocated resources, ensuring there are no memory leaks: + +```c +(void) lxb_dom_collection_destroy(&collection, false); +(void) lxb_css_stylesheet_destroy(sst, true); +(void) lxb_css_parser_destroy(parser, true); +(void) lxb_html_document_destroy(doc); +``` + +## Conclusion + +The presented example demonstrates the process of parsing and applying CSS styles to an HTML document using the Lexbor library. By following through each part of the code, one can gain insights into how to effectively manage CSS properties within a structured HTML environment, allowing for flexible design and styling in modern web applications. \ No newline at end of file diff --git a/source/examples/styles/walk.md b/source/examples/styles/walk.md new file mode 100644 index 0000000..ca012b7 --- /dev/null +++ b/source/examples/styles/walk.md @@ -0,0 +1,52 @@ +# CSS Style Walking Example + +This article explains the functionality and structure of the code found in `lexbor/styles/walk.c`. 
The example focuses on parsing an HTML document, attaching CSS styles to an element, and traversing the applied styles. The primary goal of this example is to demonstrate how to manipulate the Document Object Model (DOM) and apply CSS styling in the Lexbor library. + +## Overview of the Code + +The provided code is organized into several key sections. Each section serves a significant purpose within the program, which includes parsing HTML, creating a CSS parser, and navigating through the styles associated with specific HTML elements. + +### Include Directives and Function Prototypes + +The code begins by including essential header files from the Lexbor library, specifically for HTML and CSS functionalities. It defines two primary callback functions: + +1. **callback**: This function is executed to print serialized CSS data. +2. **walk_cb**: This function is intended to be called for each CSS style declaration when walking through the styles applied to an HTML element. + +### Main Functionality + +The `main` function encompasses the workflow of the program, starting with the initialization of the HTML document and CSS objects. Here's a detailed breakdown of its sections: + +1. **Document Creation**: + The code allocates memory for a new HTML document using `lxb_html_document_create()`. If it fails, the program exits with an error. + +2. **CSS Initialization**: + The HTML document initiates its CSS functionality through `lxb_html_document_css_init()`. Similar to document creation, any failure leads to program termination. + +3. **HTML Parsing**: + The program parses a static HTML string containing a `
<div>` element using `lxb_html_document_parse()`. Again, error handling ensures that the program only proceeds if parsing is successful. + +4. **CSS Parsing**: + A CSS parser is created and initialized. The program then attempts to parse a set of CSS selectors and styles. Successful parsing leads to the association of the stylesheet with the HTML document. + +5. **DOM Node Selection**: + The program searches for HTML elements using the CSS class name through `lxb_dom_node_by_class_name()`. If no elements are found or if an error occurs, the program appropriately exits. + +6. **Style Walking**: + The function `lxb_html_element_style_walk()` is called to iterate over the styles applied to the `<div>` element selected earlier. The `walk_cb` function is employed as a callback, allowing printing of style information. + +### Walking Through Styles + +In the `walk_cb` callback function, several actions take place: + +- The CSS rule declaration is serialized and printed using `lxb_css_rule_declaration_serialize()`. +- The name and value of each property in the style declaration are serialized and printed through `lxb_css_property_serialize_name()` and `lxb_css_property_serialize()`. This provides complete visibility into the CSS properties applied to the `<div>
`. +- The specificity of each CSS rule, including various parameters that determine the importance and origin of the styles, is printed. + +### Resource Cleanup + +Finally, the program ensures that all allocated resources are correctly destroyed using respective cleanup functions for DOM collections, stylesheets, parsers, and the HTML document itself. This step is crucial for preventing memory leaks and ensuring efficient resource management. + +## Conclusion + +This code example highlights the integration of HTML parsing and CSS styling using the Lexbor library. By utilizing the provided functions and callback methods, developers can effectively manipulate and inspect styles associated with HTML elements. The careful arrangement of initialization, parsing, walking through styles, and resource cleanup demonstrates best practices in managing dynamic web content. \ No newline at end of file diff --git a/source/examples/unicode/idna_to_ascii.md b/source/examples/unicode/idna_to_ascii.md new file mode 100644 index 0000000..fe3abcb --- /dev/null +++ b/source/examples/unicode/idna_to_ascii.md @@ -0,0 +1,97 @@ +# IDNA to ASCII Conversion Example + +This document provides an explanation of the IDNA to ASCII conversion code example located in the `lexbor/unicode/idna_to_ascii.c` source file. The code focuses on converting Internationalized Domain Names (IDN) from their Unicode representations to ASCII, which is often required for compatibility with DNS systems. + +## Overview + +The program begins by initializing the necessary libraries and defining the main entry point. Central to the workflow is the utilization of the `lxb_unicode_idna_t` structure, which is responsible for handling the conversion process. The program reads data from standard input and manages memory dynamically to accommodate varying input sizes. + +## Code Explanation + +### Initialization + +The program begins with include directives, where it imports the lexbor unicode library. 
The `callback` function is declared, which is used later in the code to process the results of the conversion. + +In the `main` function, the variables are declared, and critical initialization occurs: + +```c +status = lxb_unicode_idna_init(&idna); +``` + +Here, `lxb_unicode_idna_init` initializes an IDNA object, and the program checks for successful initialization, exiting if it fails. + +### Memory Allocation + +Memory allocation is handled using the `lexbor_malloc` function. The program allocates a buffer to read input data: + +```c +buf = lexbor_malloc(sizeof(inbuf)); +``` + +If memory allocation fails, the program gracefully handles the error by cleaning up resources and terminating. + +### Input Processing Loop + +The main processing loop reads data from standard input using `fread`. It checks for end-of-file conditions and also manages buffer overflows dynamically: + +```c +if (p + size > end) { + nsize = (end - buf) * 3; + tmp = lexbor_realloc(buf, nsize); +``` + +If additional space is needed in the buffer, the program reallocates memory to ensure there is sufficient room for incoming data, multiplying the existing size by three. This approach helps accommodate larger inputs without frequent reallocations. + +### Handling Newline Characters + +Before proceeding with the IDNA conversion, the program removes trailing newline and carriage return characters from the buffer: + +```c +if (p - buf > 0) { + if (p[-1] == '\n') { + p -= 1; + } +} +``` + +This ensures that the string sent for conversion does not include unwanted whitespace or end-of-line characters, which could potentially affect the conversion. 
+ +### IDNA Conversion + +The core functionality of the program lies in the call to `lxb_unicode_idna_to_ascii`, which performs the actual conversion from Unicode to ASCII: + +```c +status = lxb_unicode_idna_to_ascii(&idna, buf, p - buf, callback, NULL, 0); +``` + +This function takes the initialized IDNA object, the buffer of data, its length, and a callback function that will handle the output. + +### Callback Function + +The `callback` function is essential for processing the results: + +```c +static lxb_status_t +callback(const lxb_char_t *data, size_t len, void *ctx) +{ + printf("%.*s", (int) len, (const char *) data); + return LXB_STATUS_OK; +} +``` + +This function simply prints the converted ASCII data to the standard output. It receives the data generated by the conversion and its length, allowing it to format the output correctly. + +### Cleanup and Exit + +Finally, the program ensures that all allocated resources are cleaned up correctly: + +```c +lexbor_free(buf); +lxb_unicode_idna_destroy(&idna, false); +``` + +The error handling also follows a similar pattern, ensuring that there are no memory leaks or dangling pointers by freeing up the allocated buffer and destroying the IDNA object. + +## Conclusion + +This IDNA to ASCII conversion example demonstrates important concepts related to memory management, input handling, and Unicode processing in C using the lexbor library. Through structured control flow and careful resource management, the program efficiently converts IDN input into a format compatible with traditional DNS systems. The use of callback functions helps in handling outputs dynamically, showcasing an effective design pattern in C programming. 
\ No newline at end of file diff --git a/source/examples/unicode/normalization_form.md b/source/examples/unicode/normalization_form.md new file mode 100644 index 0000000..171b129 --- /dev/null +++ b/source/examples/unicode/normalization_form.md @@ -0,0 +1,54 @@ +# Unicode Normalization Example + +This article explains the example code found in the file `lexbor/unicode/normalization_form.c`. The program demonstrates how to perform Unicode normalization using the Lexbor library, specifically focusing on four normalization forms: NFC, NFD, NFKC, and NFKD. + +## Introduction + +The code begins by including the necessary headers for Unicode functionality and encoding. It defines a Unicode string, `"ẛ̣"`, which consists of the code points `U+1E9B` (LATIN SMALL LETTER S WITH DOT ABOVE) and `U+0323` (COMBINING DOT BELOW). The program aims to normalize this string and print the results of each normalization form. + +## Main Function + +The `main` function is the entry point of the program. Here, a `lxb_unicode_normalizer_t` object is created with the function `lxb_unicode_normalizer_create()`. This object will be used to perform the normalization forms. The initialization of this object specifies the normalization form to use, starting with NFC (Normalization Form C). + +### Initialization + +After the Unicode normalizer object is successfully created, it is initialized with NFC: + +```c +status = lxb_unicode_normalizer_init(uc, LXB_UNICODE_NFC); +``` + +If the initialization fails (`status != LXB_STATUS_OK`), an error message is printed, and the program exits with a failure status. Similar checks are made after each normalization operation to handle potential errors. + +## Normalization Operations + +The code proceeds through each normalization form: NFC, NFD, NFKC, and NFKD. In each case, the following steps are performed: + +1. Set the desired normalization form using `lxb_unicode_normalization_form_set(uc, ...)`. +2. 
Call `lxb_unicode_normalize(...)` to perform the normalization, passing the source string, its length, a callback function to handle the result, a context pointer (here, the name of the normalization form, which the callback prints), and a boolean indicating whether this is the last chunk of input to normalize. + +For instance, the NFC normalization is conducted as follows: + +```c +status = lxb_unicode_normalize(uc, source, sizeof(source) - 1, callback, "NFC", true); +``` + +Each normalization form will produce a different output, reflecting how the Unicode string is represented under various normalization rules. The callback function processes the normalized output. + +## Callback Function + +The `callback` function accepts the normalized data, its length, and a context string (the name of the normalization form). Inside this function, the received data is processed to decode valid UTF-8 sequences. It utilizes the Lexbor function `lxb_encoding_decode_valid_utf_8_single()` to decode each character code point and print it in hexadecimal format. + +### Printing the Results + +Here's how the function handles output: + +1. It prints the name of the normalization being processed. +2. It enters a loop to decode and print each code point in hexadecimal format until all data is processed. +3. Finally, it prints the normalized data itself as a string for reference. + +## Conclusion + +After performing all normalization forms, the program cleans up by calling `lxb_unicode_normalizer_destroy(uc, true)` to free the allocated resources. It returns a success status, indicating that all operations were completed without errors. + +This example provides a practical approach to understanding how Unicode normalization works in the Lexbor library and demonstrates how to handle Unicode strings effectively.
\ No newline at end of file diff --git a/source/examples/unicode/normalization_form_stdin.md b/source/examples/unicode/normalization_form_stdin.md new file mode 100644 index 0000000..2b0d569 --- /dev/null +++ b/source/examples/unicode/normalization_form_stdin.md @@ -0,0 +1,29 @@ +# Unicode Normalization Form Example + +This article describes the implementation found in the file `lexbor/unicode/normalization_form_stdin.c`. The purpose of this code example is to read input from standard input (stdin), apply a specified Unicode normalization form, and print the normalized output. The program supports four normalization forms: NFC, NFD, NFKC, and NFKD. + +## Overview of the Code + +The code begins with necessary include statements and defines the structure for the callback function. Here's a breakdown of the main parts of the code: + +### Main Function + +The `main` function serves as the entry point of the program. Its operation includes: + +1. **Argument Handling**: It verifies that at least one argument is provided to specify the normalization form. If not, it directs the flow to a usage message. The accepted arguments are either "NFC" or "NFD" for three-character forms and "NFKC" or "NFKD" for four-character forms. + +2. **Normalization Form Selection**: Depending on the command line argument, the program sets the appropriate normalization form using a series of `if` statements that compare the input string. If none of the specified forms are matched, it again leads to the usage message. + +3. **Initialization of the Normalizer**: The Unicode normalizer is created with `lxb_unicode_normalizer_create()`, followed by its initialization using `lxb_unicode_normalizer_init()`. Upon failure to initialize, the program returns an error status. + +4. **Reading Input and Normalization Loop**: The program then enters a loop where it reads data from stdin into an input buffer. 
Using `fread`, it checks if the end of the file (EOF) is reached or if an error occurs during reading. If data is read successfully, it passes the input to the normalization function `lxb_unicode_normalize()`, which applies the specified normalization using a callback function. + +5. **Cleanup**: After processing, it cleans up by destroying the normalizer with `lxb_unicode_normalizer_destroy()`. + +### The Callback Function + +The `callback` function is defined to handle the normalized output data. It takes the normalized data along with its length and prints it to the standard output. The format specifier `%.*s` is used to ensure that only the part of the buffer corresponding to the normalized data length is printed, handling potential null-termination issues gracefully. + +## Conclusion + +This example illustrates how to implement a basic command line utility for Unicode normalization using the lexbor library. It effectively demonstrates handling input, processing data with a normalization algorithm, and producing output. This utility can be useful in applications where consistent Unicode representation is crucial, such as in text processing and data interchange scenarios. Users can invoke the tool with specific normalization forms to transform their input accordingly. \ No newline at end of file diff --git a/source/examples/url/parse.md b/source/examples/url/parse.md new file mode 100644 index 0000000..6d4a9ff --- /dev/null +++ b/source/examples/url/parse.md @@ -0,0 +1,87 @@ +# URL Parsing Example + +This article examines a code example from the `lexbor/url/parse.c` file, focusing on URL parsing using the Lexbor library. The intent of this code is to demonstrate how to initialize the URL parser, parse a URL string, and subsequently serialize different components of the parsed URL, such as the scheme, username, password, host, and more. Each section of the code plays a critical role in handling URL data. 
+ +## Code Breakdown + +### Initialization + +The code begins by including the necessary header for the Lexbor URL library and defining a static callback function. In the `main` function, several variables are declared, including a pointer to `lxb_url_t`, an instance of `lxb_url_parser_t`, and `lxb_unicode_idna_t`. + +```c +lxb_url_parser_t parser; +lxb_unicode_idna_t idna; +``` + +Here, `parser` is used to handle the URL parsing logic, while `idna` is utilized for Internationalized Domain Name (IDN) handling. + +### Parsing the URL + +A static constant `url_str` initializes with a URL string containing various components, including a scheme (`https`), credentials (`panda:pass`), a domain name with Unicode characters, a port number (`2030`), a path, a query parameter, and a fragment. + +```c +static const lexbor_str_t url_str = lexbor_str("https://panda:pass@тест.com:2030/path/to/hell?id=54321#comments"); +``` + +Next, the parser is initialized using the `lxb_url_parser_init` function. It is crucial to check the returned status to ensure that the parser was initialized successfully. + +```c +status = lxb_url_parser_init(&parser, NULL); +if (status != LXB_STATUS_OK) { + printf("Failed to init URL parser.\n"); + return EXIT_FAILURE; +} +``` + +If the parser fails to initialize, an error message is printed, and the program exits. + +### Executing the Parse + +The URL is parsed through `lxb_url_parse`, which processes the URL string into its various components. Again, it is crucial to validate that the parsing was successful by checking if `url` is `NULL`. + +```c +url = lxb_url_parse(&parser, NULL, url_str.data, url_str.length); +if (url == NULL) { + printf("Failed to parse URL.\n"); + return EXIT_FAILURE; +} +``` + +### Serializing URL Components + +After successful parsing, the next step involves destroying the parser to clean up resources. The code then initializes the IDNA handler, which is necessary for the following serialization of Unicode hostnames. 
+ +```c +status = lxb_unicode_idna_init(&idna); +if (status != LXB_STATUS_OK) { + printf("Failed to init IDNA.\n"); + return EXIT_FAILURE; +} +``` + +The program outputs the original URL string and proceeds to serialize various parts of the URL. Each serialization function is linked to the previously defined `callback`, which handles the output for each component. + +- **Serialized URL**: Outputs the entire URL. +- **Scheme**: Extracts and displays only the scheme portion. +- **Username and Password**: Collects and shows the relevant sections. +- **Host**: Contains both ASCII and Unicode serialization capabilities. +- **Port, Path, Query, and Fragment**: Serializes these components in turn, showcasing all aspects of the URL. + +```c +(void) lxb_url_serialize(url, callback, NULL, false); +``` + +Each of these print statements utilizes the callback function to handle the printing of serialized data. + +### Cleanup + +Finally, the program cleans up by destroying the IDNA handler and the allocated URL memory, ensuring that no resources are leaked. + +```c +(void) lxb_unicode_idna_destroy(&idna, false); +(void) lxb_url_memory_destroy(url); +``` + +### Conclusion + +The example succinctly demonstrates the capabilities of the Lexbor URL parsing library, showcasing how to initialize the parser, handle a complex URL with Unicode characters, and serialize its components. Each part of the code works harmoniously to show how flexible and powerful URL handling can be in modern C programming with the Lexbor library. The proper initialization, error handling, and cleanup are crucial for robust application development. \ No newline at end of file diff --git a/source/examples/url/relative.md b/source/examples/url/relative.md new file mode 100644 index 0000000..a612710 --- /dev/null +++ b/source/examples/url/relative.md @@ -0,0 +1,80 @@ +# URL Parsing Example + +This article provides an explanation of the URL parsing example found in the source file `lexbor/url/relative.c`. 
The example demonstrates the parsing of a relative URL based on a provided base URL using the lexbor library. It outlines the setup of the URL parser, handling of input strings, and the serialization of various components of the parsed URL. + +## Code Breakdown + +### Initial Setup + +The program begins by including necessary headers and defining the callback function. The callback function serves the purpose of printing parsed URL components. The main function contains the core logic where URL parsing occurs. + +```c +static lxb_status_t +callback(const lxb_char_t *data, size_t len, void *ctx); +``` + +### URL Initialization + +In `main`, variables are defined for the base URL and the URL to parse. The lexbor string structures are initialized with `url_str` and `base_url_str`. The `lxb_url_parser_t parser` is initialized to set up the parser for processing the URLs. + +```c +lxb_url_parser_t parser; +status = lxb_url_parser_init(&parser, NULL); +``` + +This initializes the parser and checks for successful initialization. If it fails, the program outputs an error message and exits. + +### Parsing Base URL + +The `base_url` is then parsed using `lxb_url_parse`, which takes the initialized parser, a null pointer (for context), the data of the base URL string, and its length. + +```c +base_url = lxb_url_parse(&parser, NULL, base_url_str.data, base_url_str.length); +``` + +If parsing the base URL fails, an error message is printed. + +### Cleaning Up and Parsing Relative URL + +Subsequently, the parser is cleaned up, and the relative URL is parsed in a similar manner using the base URL as a reference. + +```c +lxb_url_parser_clean(&parser); +url = lxb_url_parse(&parser, base_url, url_str.data, url_str.length); +``` + +Again, if the parsing fails, an appropriate error message is printed. After the relative URL is successfully parsed, the parser must be cleaned up using `lxb_url_parser_destroy`. 
+ +### Serializing URL Components + +The main focus of this example is the serialization of various components of the parsed URL. Using callbacks, the program outputs the base URL, relative URL, and several segments of the parsed URL: + +- Scheme +- Username +- Password +- Host (both ASCII and Unicode) +- Port +- Path +- Query +- Fragment + +Each of these components is printed by invoking serialization functions, such as `lxb_url_serialize_scheme` for the scheme, and so forth. + +```c +(void) lxb_url_serialize(url, callback, NULL, false); +``` + +The callback function defined earlier is utilized here to display each component by printing its representation. + +### Final Cleanup + +After displaying all URL components, the program cleans up the IDNA context and the memory associated with the parsed URL. This ensures that any resources utilized during the parsing are properly released. + +```c +(void) lxb_unicode_idna_destroy(&idna, false); +(void) lxb_url_memory_destroy(url); +``` + +### Conclusion + +The provided example illustrates the process of relative URL parsing using the lexbor library. From initializing the parser to serializing specific components of the URL, each step is crucial for accurate URL handling in applications. The careful management of memory and resources also highlights best practices in programming with C. \ No newline at end of file diff --git a/source/index.md b/source/index.md index 9ce5201..2905bf6 100644 --- a/source/index.md +++ b/source/index.md @@ -1,6 +1,6 @@ ```{toctree} :hidden: -:maxdepth: 2 +:maxdepth: 1 documentation download @@ -84,4 +84,4 @@ repository](https://github.com/lexbor/lexbor/tree/master/examples/lexbor). 
We also discuss some of them in extra detail to provide insights into `lexbor` usage: -- [CSS selectors, the easy way](articles/example-CSS-selectors-easy-way) +[Examples](examples/index) From 4764dbabc3535c5952e1acf9f11dc2b125aadb19 Mon Sep 17 00:00:00 2001 From: Toxypi Date: Sat, 28 Sep 2024 08:47:25 +0100 Subject: [PATCH 2/9] Makefile: Added linkcheck. --- Makefile | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/Makefile b/Makefile index 9c1eaf4..e1a576a 100644 --- a/Makefile +++ b/Makefile @@ -84,3 +84,7 @@ upload: clean-doc backup deploy rsync -rctvn $(DEPLOYDIR)/ $(HOST):$(REMOTEDIR) # Final sync if dry-run is successful rsync -rctv $(DEPLOYDIR)/ $(HOST):$(REMOTEDIR) + +.PHONY: linkcheck +linkcheck: + $(call venv_exec, $(SPHINX) -b linkcheck $(SOURCEDIR) $(BUILDDIR)) From 2d27481117e2ab0858669971ff96c8a96df1470a Mon Sep 17 00:00:00 2001 From: Toxypi Date: Sat, 28 Sep 2024 08:47:45 +0100 Subject: [PATCH 3/9] Added structure, GitHub links to examples. --- source/examples/css/StyleSheet.md | 2 +- source/examples/css/index.md | 13 ++++ source/examples/css/syntax/simple_colorize.md | 2 +- .../css/syntax/structure_parse_file.md | 2 +- .../css/syntax/tokenizer/chunks_stdin.md | 2 +- .../css/syntax/tokenizer/from_file.md | 2 +- .../examples/encoding/buffer/decode/decode.md | 2 +- .../encoding/buffer/decode/decoder.md | 2 +- .../encoding/buffer/decode/validate.md | 2 +- .../examples/encoding/buffer/encode/encode.md | 2 +- .../encoding/buffer/encode/validate.md | 2 +- source/examples/encoding/buffer/from_to.md | 2 +- source/examples/encoding/data_by_name.md | 4 +- source/examples/encoding/index.md | 16 ++++ .../examples/encoding/single/decode/decode.md | 2 +- .../encoding/single/decode/decoder.md | 2 +- .../encoding/single/decode/validate.md | 2 +- .../examples/encoding/single/encode/encode.md | 2 +- .../encoding/single/encode/validate.md | 2 +- source/examples/encoding/single/from_to.md | 2 +- source/examples/html/document_parse.md | 4 +- 
source/examples/html/document_parse_chunk.md | 2 +- source/examples/html/document_title.md | 2 +- source/examples/html/element_attributes.md | 2 +- source/examples/html/element_create.md | 2 +- source/examples/html/element_innerHTML.md | 2 +- .../examples/html/elements_by_class_name.md | 2 +- source/examples/html/elements_by_tag_name.md | 2 +- source/examples/html/encoding.md | 2 +- source/examples/html/html2sexpr.md | 2 +- source/examples/html/index.md | 11 +++ source/examples/html/parse.md | 2 +- source/examples/html/parse_chunk.md | 4 +- source/examples/html/tokenizer/callback.md | 2 +- source/examples/html/tokenizer/simple.md | 2 +- source/examples/html/tokenizer/text.md | 2 +- source/examples/index.md | 77 +++---------------- source/examples/punycode/decode.md | 2 +- source/examples/punycode/encode.md | 4 +- source/examples/punycode/index.md | 10 +++ source/examples/selectors/easy_way.md | 2 +- source/examples/selectors/index.md | 10 +++ source/examples/selectors/normal_way.md | 2 +- source/examples/styles/attribute_style.md | 2 +- source/examples/styles/events_insert.md | 4 +- source/examples/styles/index.md | 10 +++ source/examples/styles/stylesheet.md | 2 +- source/examples/styles/walk.md | 2 +- source/examples/unicode/idna_to_ascii.md | 2 +- source/examples/unicode/index.md | 10 +++ source/examples/unicode/normalization_form.md | 2 +- .../unicode/normalization_form_stdin.md | 2 +- source/examples/url/index.md | 10 +++ source/examples/url/parse.md | 2 +- source/examples/url/relative.md | 2 +- source/index.md | 1 + 56 files changed, 153 insertions(+), 117 deletions(-) create mode 100644 source/examples/css/index.md create mode 100644 source/examples/encoding/index.md create mode 100644 source/examples/html/index.md create mode 100644 source/examples/punycode/index.md create mode 100644 source/examples/selectors/index.md create mode 100644 source/examples/styles/index.md create mode 100644 source/examples/unicode/index.md create mode 100644 
source/examples/url/index.md diff --git a/source/examples/css/StyleSheet.md b/source/examples/css/StyleSheet.md index 45e4b5f..75498ca 100644 --- a/source/examples/css/StyleSheet.md +++ b/source/examples/css/StyleSheet.md @@ -1,6 +1,6 @@ # CSS Stylesheet Parsing Example -This article explains the example code within the file `lexbor/css/StyleSheet.c`, which demonstrates how to use the Lexbor library to read and parse a CSS stylesheet. The code showcases the steps required to initialize the parser, read the CSS data from a file, parse the stylesheet, and serialize the resulting object. +This article explains the example code within the file [lexbor/css/StyleSheet.c](https://github.com/lexbor/lexbor/blob/master/examples/lexbor/css/StyleSheet.c), which demonstrates how to use the Lexbor library to read and parse a CSS stylesheet. The code showcases the steps required to initialize the parser, read the CSS data from a file, parse the stylesheet, and serialize the resulting object. ## Code Breakdown diff --git a/source/examples/css/index.md b/source/examples/css/index.md new file mode 100644 index 0000000..80abb10 --- /dev/null +++ b/source/examples/css/index.md @@ -0,0 +1,13 @@ +# CSS Examples + +These articles delve into some of the examples in more detail to help you use `lexbor` more effectively. + +```{toctree} +:maxdepth: 1 +:glob: + +* +selectors/* +syntax/* +syntax/tokenizer/* +``` diff --git a/source/examples/css/syntax/simple_colorize.md b/source/examples/css/syntax/simple_colorize.md index f086ee7..86f30d3 100644 --- a/source/examples/css/syntax/simple_colorize.md +++ b/source/examples/css/syntax/simple_colorize.md @@ -1,6 +1,6 @@ # CSS Syntax Parsing Example -This article provides an explanation of a code example from the source file `lexbor/css/syntax/simple_colorize.c`. The code implements a simple CSS parser that reads a CSS file, parses its content, and provides color-coded output for each type of CSS rule and declaration using ANSI escape codes. 
+This article provides an explanation of a code example from the source file [lexbor/css/syntax/simple_colorize.c](https://github.com/lexbor/lexbor/blob/master/examples/lexbor/css/syntax/simple_colorize.c). The code implements a simple CSS parser that reads a CSS file, parses its content, and provides color-coded output for each type of CSS rule and declaration using ANSI escape codes. ## Structure of the Program diff --git a/source/examples/css/syntax/structure_parse_file.md b/source/examples/css/syntax/structure_parse_file.md index 6ada7e9..3e79792 100644 --- a/source/examples/css/syntax/structure_parse_file.md +++ b/source/examples/css/syntax/structure_parse_file.md @@ -1,6 +1,6 @@ # CSS Syntax Parser Example -This article provides an overview of the code located in `lexbor/css/syntax/structure_parse_file.c`, which implements a CSS syntax parser using the lexbor library. The primary goal of this code is to parse CSS syntax rules and declarations, handling various states and transitions within the parsing process. +This article provides an overview of the code located in [lexbor/css/syntax/structure_parse_file.c](https://github.com/lexbor/lexbor/blob/master/examples/lexbor/css/syntax/structure_parse_file.c), which implements a CSS syntax parser using the lexbor library. The primary goal of this code is to parse CSS syntax rules and declarations, handling various states and transitions within the parsing process. ## Code Overview diff --git a/source/examples/css/syntax/tokenizer/chunks_stdin.md b/source/examples/css/syntax/tokenizer/chunks_stdin.md index a387c75..146fa99 100644 --- a/source/examples/css/syntax/tokenizer/chunks_stdin.md +++ b/source/examples/css/syntax/tokenizer/chunks_stdin.md @@ -1,6 +1,6 @@ # CSS Syntax Tokenizer Example -This article explains the implementation of a CSS syntax tokenizer in the file `lexbor/css/syntax/tokenizer/chunks_stdin.c`. 
The code demonstrates how to read CSS data from standard input, tokenize it, and output the identified token types along with their serialized representations. +This article explains the implementation of a CSS syntax tokenizer in the file [lexbor/css/syntax/tokenizer/chunks_stdin.c](https://github.com/lexbor/lexbor/blob/master/examples/lexbor/css/syntax/tokenizer/chunks_stdin.c). The code demonstrates how to read CSS data from standard input, tokenize it, and output the identified token types along with their serialized representations. ## Overview diff --git a/source/examples/css/syntax/tokenizer/from_file.md b/source/examples/css/syntax/tokenizer/from_file.md index dbd336a..e33c08c 100644 --- a/source/examples/css/syntax/tokenizer/from_file.md +++ b/source/examples/css/syntax/tokenizer/from_file.md @@ -1,6 +1,6 @@ # CSS Syntax Tokenizer Example -This article provides a detailed explanation of a CSS syntax tokenizer implemented in the file `lexbor/css/syntax/tokenizer/from_file.c`. The code serves the purpose of reading a CSS file, processing its contents to extract tokens, and producing output that describes each token. +This article provides a detailed explanation of a CSS syntax tokenizer implemented in the file [lexbor/css/syntax/tokenizer/from_file.c](https://github.com/lexbor/lexbor/blob/master/examples/lexbor/css/syntax/tokenizer/from_file.c). The code serves the purpose of reading a CSS file, processing its contents to extract tokens, and producing output that describes each token. 
## Overview diff --git a/source/examples/encoding/buffer/decode/decode.md b/source/examples/encoding/buffer/decode/decode.md index d127104..792ac9f 100644 --- a/source/examples/encoding/buffer/decode/decode.md +++ b/source/examples/encoding/buffer/decode/decode.md @@ -1,6 +1,6 @@ # UTF-8 Decoding Example -In this article, we will explore a code example from the file `lexbor/encoding/buffer/decode/decode.c` that demonstrates how to decode a UTF-8 encoded string into code points using the Lexbor library. This example specifically highlights the usage of Lexbor's encoding functionalities, providing insights into how to leverage these features for character decoding in C. +In this article, we will explore a code example from the file [lexbor/encoding/buffer/decode/decode.c](https://github.com/lexbor/lexbor/blob/master/examples/lexbor/encoding/buffer/decode/decode.c) that demonstrates how to decode a UTF-8 encoded string into code points using the Lexbor library. This example specifically highlights the usage of Lexbor's encoding functionalities, providing insights into how to leverage these features for character decoding in C. ## Code Explanation diff --git a/source/examples/encoding/buffer/decode/decoder.md b/source/examples/encoding/buffer/decode/decoder.md index 2a57f61..4463576 100644 --- a/source/examples/encoding/buffer/decode/decoder.md +++ b/source/examples/encoding/buffer/decode/decoder.md @@ -1,6 +1,6 @@ # Unicode Decoder Example -In this article, we will discuss a simple Unicode decoder implemented in C, specifically within the context of the lexbor library. The code can be found in the source file `lexbor/encoding/buffer/decode/decoder.c`. This program is designed to take a specified character encoding from the command line, read input data, and decode it into Unicode code points, displaying the result in a format suitable for further processing or representation. 
+In this article, we will discuss a simple Unicode decoder implemented in C, specifically within the context of the lexbor library. The code can be found in the source file [lexbor/encoding/buffer/decode/decoder.c](https://github.com/lexbor/lexbor/blob/master/examples/lexbor/encoding/buffer/decode/decoder.c). This program is designed to take a specified character encoding from the command line, read input data, and decode it into Unicode code points, displaying the result in a format suitable for further processing or representation. ## Code Structure Overview diff --git a/source/examples/encoding/buffer/decode/validate.md b/source/examples/encoding/buffer/decode/validate.md index 78826be..74f07f8 100644 --- a/source/examples/encoding/buffer/decode/validate.md +++ b/source/examples/encoding/buffer/decode/validate.md @@ -1,6 +1,6 @@ # UTF-8 Decoding and Replacement Example -This article will explain a C code example that demonstrates UTF-8 decoding and the handling of invalid byte sequences using the lexbor library. The source file for the example is `lexbor/encoding/buffer/decode/validate.c`. +This article will explain a C code example that demonstrates UTF-8 decoding and the handling of invalid byte sequences using the lexbor library. The source file for the example is [lexbor/encoding/buffer/decode/validate.c](https://github.com/lexbor/lexbor/blob/master/examples/lexbor/encoding/buffer/decode/validate.c). ## Overview diff --git a/source/examples/encoding/buffer/encode/encode.md b/source/examples/encoding/buffer/encode/encode.md index 50d58b5..fe02f7c 100644 --- a/source/examples/encoding/buffer/encode/encode.md +++ b/source/examples/encoding/buffer/encode/encode.md @@ -1,6 +1,6 @@ # Encoding Unicode Code Points to UTF-8 Example -This article explains the encoding of Unicode code points to a UTF-8 byte string using the Lexbor library. The source code is located in `lexbor/encoding/buffer/encode/encode.c`. 
This example demonstrates how to initialize the encoder, encode Unicode code points, and handle the output appropriately. +This article explains the encoding of Unicode code points to a UTF-8 byte string using the Lexbor library. The source code is located in [lexbor/encoding/buffer/encode/encode.c](https://github.com/lexbor/lexbor/blob/master/examples/lexbor/encoding/buffer/encode/encode.c). This example demonstrates how to initialize the encoder, encode Unicode code points, and handle the output appropriately. ## Overview diff --git a/source/examples/encoding/buffer/encode/validate.md b/source/examples/encoding/buffer/encode/validate.md index 90b73d6..5d497c8 100644 --- a/source/examples/encoding/buffer/encode/validate.md +++ b/source/examples/encoding/buffer/encode/validate.md @@ -1,6 +1,6 @@ # Unicode Encoding Example -This article explains the functionality of a Unicode encoding example, which can be found in the source file `lexbor/encoding/buffer/encode/validate.c`. The code serves as an illustration of how to encode Unicode code points into a UTF-8 byte string using the Lexbor library. +This article explains the functionality of a Unicode encoding example, which can be found in the source file [lexbor/encoding/buffer/encode/validate.c](https://github.com/lexbor/lexbor/blob/master/examples/lexbor/encoding/buffer/encode/validate.c). The code serves as an illustration of how to encode Unicode code points into a UTF-8 byte string using the Lexbor library. ## Overview diff --git a/source/examples/encoding/buffer/from_to.md b/source/examples/encoding/buffer/from_to.md index 248dc8c..c84c2c4 100644 --- a/source/examples/encoding/buffer/from_to.md +++ b/source/examples/encoding/buffer/from_to.md @@ -1,6 +1,6 @@ # Encoding Conversion Example -This article describes an example of encoding conversion using the `from_to` program from the `lexbor` library, specifically found in the source file `lexbor/encoding/buffer/from_to.c`. 
The program reads data from the standard input, converts the data from one encoding to another (specified by the user), and outputs the result to the standard output. +This article describes an example of encoding conversion using the `from_to` program from the `lexbor` library, specifically found in the source file [lexbor/encoding/buffer/from_to.c](https://github.com/lexbor/lexbor/blob/master/examples/lexbor/encoding/buffer/from_to.c). The program reads data from the standard input, converts the data from one encoding to another (specified by the user), and outputs the result to the standard output. ## Overview diff --git a/source/examples/encoding/data_by_name.md b/source/examples/encoding/data_by_name.md index c89d3a4..8de6afd 100644 --- a/source/examples/encoding/data_by_name.md +++ b/source/examples/encoding/data_by_name.md @@ -1,6 +1,6 @@ # Encoding Data Retrieval Example -This article provides an explanation of an example from the file `lexbor/encoding/data_by_name.c`. The purpose of this code is to demonstrate how to retrieve encoding data by its name using the Lexbor encoding library. The code illustrated here highlights the procedure for accessing character encoding information, specifically focusing on UTF-8. +This article provides an explanation of an example from the file [lexbor/encoding/data_by_name.c](https://github.com/lexbor/lexbor/blob/master/examples/lexbor/encoding/data_by_name.c). The purpose of this code is to demonstrate how to retrieve encoding data by its name using the Lexbor encoding library. The code illustrated here highlights the procedure for accessing character encoding information, specifically focusing on UTF-8. ## Code Explanation @@ -63,4 +63,4 @@ This line returns a success status to the operating system, indicating that the ## Conclusion -The example presented in `lexbor/encoding/data_by_name.c` effectively demonstrates how to access encoding data using the Lexbor encoding library. 
It showcases the importance of error handling and provides a simple way to retrieve and display the name of a character encoding, using UTF-8 as a practical example. This code can serve as a foundational component for applications that require encoding information for text processing. \ No newline at end of file +The example presented in [lexbor/encoding/data_by_name.c](https://github.com/lexbor/lexbor/blob/master/examples/lexbor/encoding/data_by_name.c) effectively demonstrates how to access encoding data using the Lexbor encoding library. It showcases the importance of error handling and provides a simple way to retrieve and display the name of a character encoding, using UTF-8 as a practical example. This code can serve as a foundational component for applications that require encoding information for text processing. \ No newline at end of file diff --git a/source/examples/encoding/index.md b/source/examples/encoding/index.md new file mode 100644 index 0000000..6875d0a --- /dev/null +++ b/source/examples/encoding/index.md @@ -0,0 +1,16 @@ +# Encoding Examples + +These articles delve into some of the examples in more detail to help you use `lexbor` more effectively. + +```{toctree} +:maxdepth: 1 +:glob: + +* +buffer/* +buffer/decode/* +buffer/encode/* +single/* +single/decode/* +single/encode/* +``` diff --git a/source/examples/encoding/single/decode/decode.md b/source/examples/encoding/single/decode/decode.md index 294ec3e..fd3fede 100644 --- a/source/examples/encoding/single/decode/decode.md +++ b/source/examples/encoding/single/decode/decode.md @@ -1,6 +1,6 @@ # UTF-8 Decoding Example -This article explains a code example from `lexbor/encoding/single/decode/decode.c`, which demonstrates how to decode a UTF-8 string into its respective code points using the lexbor library. 
+This article explains a code example from [lexbor/encoding/single/decode/decode.c](https://github.com/lexbor/lexbor/blob/master/examples/lexbor/encoding/single/decode/decode.c), which demonstrates how to decode a UTF-8 string into its respective code points using the lexbor library. ## Introduction diff --git a/source/examples/encoding/single/decode/decoder.md b/source/examples/encoding/single/decode/decoder.md index 7a039ed..5657c56 100644 --- a/source/examples/encoding/single/decode/decoder.md +++ b/source/examples/encoding/single/decode/decoder.md @@ -1,6 +1,6 @@ # Encoding Decoder Example -In this article, we will explore the encoding decoder example found in the file `lexbor/encoding/single/decode/decoder.c`. This code demonstrates how to decode input data from standard input according to a specified character encoding. It provides a useful utility for developers needing to handle various text encodings in their applications. +In this article, we will explore the encoding decoder example found in the file [lexbor/encoding/single/decode/decoder.c](https://github.com/lexbor/lexbor/blob/master/examples/lexbor/encoding/single/decode/decoder.c). This code demonstrates how to decode input data from standard input according to a specified character encoding. It provides a useful utility for developers needing to handle various text encodings in their applications. ## Code Overview diff --git a/source/examples/encoding/single/decode/validate.md b/source/examples/encoding/single/decode/validate.md index 76afd1e..5b04969 100644 --- a/source/examples/encoding/single/decode/validate.md +++ b/source/examples/encoding/single/decode/validate.md @@ -1,6 +1,6 @@ # UTF-8 Decoding and Validation Example -This article explains an example of decoding and validating a UTF-8 string, using the Lexbor library. The source file for this code example is `lexbor/encoding/single/decode/validate.c`. 
The primary objective of this code is to demonstrate how to properly decode a UTF-8 encoded string, handle decoding errors, and output both valid code points and error information for invalid byte sequences. +This article explains an example of decoding and validating a UTF-8 string, using the Lexbor library. The source file for this code example is [lexbor/encoding/single/decode/validate.c](https://github.com/lexbor/lexbor/blob/master/examples/lexbor/encoding/single/decode/validate.c). The primary objective of this code is to demonstrate how to properly decode a UTF-8 encoded string, handle decoding errors, and output both valid code points and error information for invalid byte sequences. ## Code Breakdown diff --git a/source/examples/encoding/single/encode/encode.md b/source/examples/encoding/single/encode/encode.md index 5d8d712..80581b2 100644 --- a/source/examples/encoding/single/encode/encode.md +++ b/source/examples/encoding/single/encode/encode.md @@ -1,6 +1,6 @@ # UTF-8 Encoding Example -This article explains the purpose and functionality of the UTF-8 encoding example provided in the file `lexbor/encoding/single/encode/encode.c`. The code demonstrates how to encode a series of Unicode code points into a UTF-8 byte string using the Lexbor encoding library. +This article explains the purpose and functionality of the UTF-8 encoding example provided in the file [lexbor/encoding/single/encode/encode.c](https://github.com/lexbor/lexbor/blob/master/examples/lexbor/encoding/single/encode/encode.c). The code demonstrates how to encode a series of Unicode code points into a UTF-8 byte string using the Lexbor encoding library. 
## Code Overview diff --git a/source/examples/encoding/single/encode/validate.md b/source/examples/encoding/single/encode/validate.md index 9ebfa1a..eadf876 100644 --- a/source/examples/encoding/single/encode/validate.md +++ b/source/examples/encoding/single/encode/validate.md @@ -1,6 +1,6 @@ # Encoding Unicode Code Points to UTF-8 Example -This example demonstrates how to validate and encode Unicode code points into a UTF-8 byte string using the lexbor library. The functionality is encapsulated within a C program located in the `lexbor/encoding/single/encode/validate.c` file. The purpose of this code is to illustrate the encoding of a set of given code points, handling exceptions for those that are invalid by replacing them with a predefined replacement character. +This example demonstrates how to validate and encode Unicode code points into a UTF-8 byte string using the lexbor library. The functionality is encapsulated within a C program located in the [lexbor/encoding/single/encode/validate.c](https://github.com/lexbor/lexbor/blob/master/examples/lexbor/encoding/single/encode/validate.c) file. The purpose of this code is to illustrate the encoding of a set of given code points, handling exceptions for those that are invalid by replacing them with a predefined replacement character. ## Overview of the Code diff --git a/source/examples/encoding/single/from_to.md b/source/examples/encoding/single/from_to.md index 5e120af..08968fa 100644 --- a/source/examples/encoding/single/from_to.md +++ b/source/examples/encoding/single/from_to.md @@ -1,6 +1,6 @@ # Encoding Conversion Example -This article explains the encoding conversion functionality provided in the source file `lexbor/encoding/single/from_to.c`. The code allows users to convert text from one character encoding to another via command-line input. It demonstrates how to utilize the Lexbor encoding library for encoding and decoding different formats of character sets. 
+This article explains the encoding conversion functionality provided in the source file [lexbor/encoding/single/from_to.c](https://github.com/lexbor/lexbor/blob/master/examples/lexbor/encoding/single/from_to.c). The code allows users to convert text from one character encoding to another via command-line input. It demonstrates how to utilize the Lexbor encoding library for encoding and decoding different formats of character sets. ## Overview diff --git a/source/examples/html/document_parse.md b/source/examples/html/document_parse.md index 6388648..e10b793 100644 --- a/source/examples/html/document_parse.md +++ b/source/examples/html/document_parse.md @@ -1,6 +1,6 @@ # HTML Document Parsing Example -This article explains an example of parsing an HTML document using the Lexbor library. The purpose of this example, located in the source file `lexbor/html/document_parse.c`, is to illustrate the steps necessary to create an HTML document, parse a string of HTML, and serialize the resulting DOM tree. +This article explains an example of parsing an HTML document using the Lexbor library. The purpose of this example, located in the source file [lexbor/html/document_parse.c](https://github.com/lexbor/lexbor/blob/master/examples/lexbor/html/document_parse.c), is to illustrate the steps necessary to create an HTML document, parse a string of HTML, and serialize the resulting DOM tree. ## Example Overview @@ -87,4 +87,4 @@ Ensuring proper resource management is important in C programming, as it helps m ## Conclusion -The example provided in `lexbor/html/document_parse.c` serves as a clear demonstration of how to create, parse, and handle an HTML document using Lexbor. Through careful initialization, parsing, result outputting, and cleanup, this code illustrates best practices for managing HTML documents in a C environment. 
\ No newline at end of file +The example provided in [lexbor/html/document_parse.c](https://github.com/lexbor/lexbor/blob/master/examples/lexbor/html/document_parse.c) serves as a clear demonstration of how to create, parse, and handle an HTML document using Lexbor. Through careful initialization, parsing, result outputting, and cleanup, this code illustrates best practices for managing HTML documents in a C environment. \ No newline at end of file diff --git a/source/examples/html/document_parse_chunk.md b/source/examples/html/document_parse_chunk.md index 7298f09..1525ff3 100644 --- a/source/examples/html/document_parse_chunk.md +++ b/source/examples/html/document_parse_chunk.md @@ -1,6 +1,6 @@ # HTML Document Parsing Example -This article provides an overview of an example implementation of HTML document parsing using the Lexbor library. The example is located in the source file `lexbor/html/document_parse_chunk.c`. This example demonstrates how to create an HTML document, parse it in chunks, and handle the cleaning up of allocated resources. +This article provides an overview of an example implementation of HTML document parsing using the Lexbor library. The example is located in the source file [lexbor/html/document_parse_chunk.c](https://github.com/lexbor/lexbor/blob/master/examples/lexbor/html/document_parse_chunk.c). This example demonstrates how to create an HTML document, parse it in chunks, and handle the cleaning up of allocated resources. ## Code Overview diff --git a/source/examples/html/document_title.md b/source/examples/html/document_title.md index 5952f52..e1f1c49 100644 --- a/source/examples/html/document_title.md +++ b/source/examples/html/document_title.md @@ -1,6 +1,6 @@ # HTML Document Title Example -This article will explain the functionality of the HTML document title example implemented in the source code found in `lexbor/html/document_title.c`. 
The purpose of this code is to demonstrate how to parse an HTML string, retrieve its title, modify the title, and then display the resulting HTML document structure using the Lexbor library. +This article will explain the functionality of the HTML document title example implemented in the source code found in [lexbor/html/document_title.c](https://github.com/lexbor/lexbor/blob/master/examples/lexbor/html/document_title.c). The purpose of this code is to demonstrate how to parse an HTML string, retrieve its title, modify the title, and then display the resulting HTML document structure using the Lexbor library. ## Code Breakdown diff --git a/source/examples/html/element_attributes.md b/source/examples/html/element_attributes.md index 0ef536a..8f01734 100644 --- a/source/examples/html/element_attributes.md +++ b/source/examples/html/element_attributes.md @@ -1,6 +1,6 @@ # Element Attributes Example -This article explains the implementation found in `lexbor/html/element_attributes.c`, which demonstrates how to manipulate HTML element attributes using the Lexbor library. The example outlines parsing an HTML snippet, finding an element, and performing various operations involving element attributes, such as adding, checking existence, retrieving, modifying, and removing attributes from an element. +This article explains the implementation found in [lexbor/html/element_attributes.c](https://github.com/lexbor/lexbor/blob/master/examples/lexbor/html/element_attributes.c), which demonstrates how to manipulate HTML element attributes using the Lexbor library. The example outlines parsing an HTML snippet, finding an element, and performing various operations involving element attributes, such as adding, checking existence, retrieving, modifying, and removing attributes from an element. 
## Code Overview diff --git a/source/examples/html/element_create.md b/source/examples/html/element_create.md index dd827ff..58ca337 100644 --- a/source/examples/html/element_create.md +++ b/source/examples/html/element_create.md @@ -1,6 +1,6 @@ # HTML Element Creation Example -This article explains the implementation of creating and appending HTML elements in a document using the respective Lexbor library. The example provided is from the source file `lexbor/html/element_create.c`. +This article explains the implementation of creating and appending HTML elements in a document using the Lexbor library. The example provided is from the source file [lexbor/html/element_create.c](https://github.com/lexbor/lexbor/blob/master/examples/lexbor/html/element_create.c). ## Introduction diff --git a/source/examples/html/element_innerHTML.md b/source/examples/html/element_innerHTML.md index 02740ac..aeef2d8 100644 --- a/source/examples/html/element_innerHTML.md +++ b/source/examples/html/element_innerHTML.md @@ -1,6 +1,6 @@ # Setting innerHTML Example -This article will explain the `innerHTML` manipulation in the context of the Lexbor HTML parser, as illustrated in the source file `lexbor/html/element_innerHTML.c`. This example demonstrates how to parse HTML content, modify an element's inner HTML, and serialize the result. +This article will explain the `innerHTML` manipulation in the context of the Lexbor HTML parser, as illustrated in the source file [lexbor/html/element_innerHTML.c](https://github.com/lexbor/lexbor/blob/master/examples/lexbor/html/element_innerHTML.c). This example demonstrates how to parse HTML content, modify an element's inner HTML, and serialize the result.
## Code Overview diff --git a/source/examples/html/elements_by_class_name.md b/source/examples/html/elements_by_class_name.md index 57abd22..4f6f10d 100644 --- a/source/examples/html/elements_by_class_name.md +++ b/source/examples/html/elements_by_class_name.md @@ -1,6 +1,6 @@ # Getting Elements by Class Name Example -In this article, we will explore the implementation details and functionality of the `elements_by_class_name` example, found in the `lexbor/html/elements_by_class_name.c` source file. The code demonstrates how to parse an HTML string and retrieve elements with a specific class name using the lexbor library. This example is essential for developers seeking to manipulate and query DOM elements in a structured manner. +In this article, we will explore the implementation details and functionality of the `elements_by_class_name` example, found in the [lexbor/html/elements_by_class_name.c](https://github.com/lexbor/lexbor/blob/master/examples/lexbor/html/elements_by_class_name.c) source file. The code demonstrates how to parse an HTML string and retrieve elements with a specific class name using the lexbor library. This example is essential for developers seeking to manipulate and query DOM elements in a structured manner. ## Overview diff --git a/source/examples/html/elements_by_tag_name.md b/source/examples/html/elements_by_tag_name.md index ff18ade..77c0485 100644 --- a/source/examples/html/elements_by_tag_name.md +++ b/source/examples/html/elements_by_tag_name.md @@ -1,6 +1,6 @@ # HTML Elements by Tag Name Example -This article will explain the code found in the source file `lexbor/html/elements_by_tag_name.c`, which demonstrates how to find and print HTML elements by their tag names using the Lexbor DOM library. 
+This article will explain the code found in the source file [lexbor/html/elements_by_tag_name.c](https://github.com/lexbor/lexbor/blob/master/examples/lexbor/html/elements_by_tag_name.c), which demonstrates how to find and print HTML elements by their tag names using the Lexbor DOM library. ## Code Overview diff --git a/source/examples/html/encoding.md b/source/examples/html/encoding.md index 57f6e82..526324a 100644 --- a/source/examples/html/encoding.md +++ b/source/examples/html/encoding.md @@ -1,6 +1,6 @@ # HTML Encoding Example -This article provides an explanation for the HTML Encoding example found in the file `lexbor/html/encoding.c`. This program is designed to read an HTML file, determine its character encoding, and print it out. The implementation utilizes the Lexbor library, which offers various functions to handle encoding. +This article provides an explanation for the HTML Encoding example found in the file [lexbor/html/encoding.c](https://github.com/lexbor/lexbor/blob/master/examples/lexbor/html/encoding.c). This program is designed to read an HTML file, determine its character encoding, and print it out. The implementation utilizes the Lexbor library, which offers various functions to handle encoding. ## Overview diff --git a/source/examples/html/html2sexpr.md b/source/examples/html/html2sexpr.md index 39f7327..86751a2 100644 --- a/source/examples/html/html2sexpr.md +++ b/source/examples/html/html2sexpr.md @@ -1,6 +1,6 @@ # HTML to S-Expression Converter Example -This article provides an overview of a code example found in the file `lexbor/html/html2sexpr.c`. The program is designed to convert an HTML tag tree into an S-expression string and output it to standard output. The program utilizes the Lexbor library to handle parsing and manipulating HTML documents. +This article provides an overview of a code example found in the file [lexbor/html/html2sexpr.c](https://github.com/lexbor/lexbor/blob/master/examples/lexbor/html/html2sexpr.c). 
The program is designed to convert an HTML tag tree into an S-expression string and output it to standard output. The program utilizes the Lexbor library to handle parsing and manipulating HTML documents. ## Overview diff --git a/source/examples/html/index.md b/source/examples/html/index.md new file mode 100644 index 0000000..a7ae38d --- /dev/null +++ b/source/examples/html/index.md @@ -0,0 +1,11 @@ +# HTML Examples + +These articles delve into some of the examples in more detail to help you use `lexbor` more effectively. + +```{toctree} +:maxdepth: 1 +:glob: + +* +tokenizer/* +``` diff --git a/source/examples/html/parse.md b/source/examples/html/parse.md index af79018..813b49e 100644 --- a/source/examples/html/parse.md +++ b/source/examples/html/parse.md @@ -1,6 +1,6 @@ # HTML Parsing and Serialization Example -This example demonstrates how to create an HTML parser using the lexbor library, parse simple HTML strings into document objects, and serialize those documents back to a readable format. The code is found in the source file `lexbor/html/parse.c`. +This example demonstrates how to create an HTML parser using the lexbor library, parse simple HTML strings into document objects, and serialize those documents back to a readable format. The code is found in the source file [lexbor/html/parse.c](https://github.com/lexbor/lexbor/blob/master/examples/lexbor/html/parse.c). ## Code Overview diff --git a/source/examples/html/parse_chunk.md b/source/examples/html/parse_chunk.md index 87fa83d..39a22c0 100644 --- a/source/examples/html/parse_chunk.md +++ b/source/examples/html/parse_chunk.md @@ -1,6 +1,6 @@ # HTML Chunk Parsing Example -This article provides an overview of the HTML chunk parsing example implemented in the source file `lexbor/html/parse_chunk.c`. The example demonstrates how to utilize the Lexbor HTML parsing library to handle HTML data in incremental chunks. 
By breaking the input into smaller pieces, it showcases the parser's versatility and ability to manage partial data streams effectively. +This article provides an overview of the HTML chunk parsing example implemented in the source file [lexbor/html/parse_chunk.c](https://github.com/lexbor/lexbor/blob/master/examples/lexbor/html/parse_chunk.c). The example demonstrates how to utilize the Lexbor HTML parsing library to handle HTML data in incremental chunks. By breaking the input into smaller pieces, it showcases the parser's versatility and ability to manage partial data streams effectively. ## Code Overview @@ -76,4 +76,4 @@ These calls ensure that the allocated parser and document objects are properly d ## Conclusion -The example provided in `lexbor/html/parse_chunk.c` is a straightforward illustration of how to parse HTML data incrementally with the Lexbor library. By breaking the input into manageable chunks, the parser can efficiently handle larger HTML documents and offers developers flexibility when processing dynamic or streamed data. This method is particularly useful in web environments where HTML content may not always be available as a single, complete document. \ No newline at end of file +The example provided in [lexbor/html/parse_chunk.c](https://github.com/lexbor/lexbor/blob/master/examples/lexbor/html/parse_chunk.c) is a straightforward illustration of how to parse HTML data incrementally with the Lexbor library. By breaking the input into manageable chunks, the parser can efficiently handle larger HTML documents and offers developers flexibility when processing dynamic or streamed data. This method is particularly useful in web environments where HTML content may not always be available as a single, complete document. 
\ No newline at end of file diff --git a/source/examples/html/tokenizer/callback.md b/source/examples/html/tokenizer/callback.md index 31e2731..1e7cf93 100644 --- a/source/examples/html/tokenizer/callback.md +++ b/source/examples/html/tokenizer/callback.md @@ -1,6 +1,6 @@ # HTML Tokenizer Callback Example -This article describes the implementation of an HTML Tokenizer Callback found in the `lexbor/html/tokenizer/callback.c` source file. The purpose of this code is to demonstrate how to parse an HTML string and handle tokens as they are generated. It establishes a callback mechanism that is invoked after each token is processed, allowing for custom processing or logging of token data. +This article describes the implementation of an HTML Tokenizer Callback found in the [lexbor/html/tokenizer/callback.c](https://github.com/lexbor/lexbor/blob/master/examples/lexbor/html/tokenizer/callback.c) source file. The purpose of this code is to demonstrate how to parse an HTML string and handle tokens as they are generated. It establishes a callback mechanism that is invoked after each token is processed, allowing for custom processing or logging of token data. ## Overview diff --git a/source/examples/html/tokenizer/simple.md b/source/examples/html/tokenizer/simple.md index 8612647..773a4b6 100644 --- a/source/examples/html/tokenizer/simple.md +++ b/source/examples/html/tokenizer/simple.md @@ -1,6 +1,6 @@ # HTML Tokenizer Example -This article provides a detailed explanation of an HTML tokenizer example implemented in C, demonstrating the capabilities of the lexbor library through the file `lexbor/html/tokenizer/simple.c`. This code is intended to parse a simple HTML string and display the tokens generated by the tokenizer. 
+This article provides a detailed explanation of an HTML tokenizer example implemented in C, demonstrating the capabilities of the lexbor library through the file [lexbor/html/tokenizer/simple.c](https://github.com/lexbor/lexbor/blob/master/examples/lexbor/html/tokenizer/simple.c). This code is intended to parse a simple HTML string and display the tokens generated by the tokenizer. ## Code Overview diff --git a/source/examples/html/tokenizer/text.md b/source/examples/html/tokenizer/text.md index fb361b5..dc94dae 100644 --- a/source/examples/html/tokenizer/text.md +++ b/source/examples/html/tokenizer/text.md @@ -1,6 +1,6 @@ # HTML Tokenizer Example -This article describes the functionality of the example code provided in the file `lexbor/html/tokenizer/text.c`. The code implements an HTML tokenizer using the Lexbor library, focusing on extracting and printing text tokens from HTML input. +This article describes the functionality of the example code provided in the file [lexbor/html/tokenizer/text.c](https://github.com/lexbor/lexbor/blob/master/examples/lexbor/html/tokenizer/text.c). The code implements an HTML tokenizer using the Lexbor library, focusing on extracting and printing text tokens from HTML input. ## Overview of the Code diff --git a/source/examples/index.md b/source/examples/index.md index 4e9ef05..caaba33 100644 --- a/source/examples/index.md +++ b/source/examples/index.md @@ -1,71 +1,16 @@ ---- -orphan: true ---- +# Examples These articles delve into some of the examples in more detail to help you use `lexbor` more effectively. 
-CSS: - -```{toctree} -:maxdepth: 1 -:glob: - -css/* -css/selectors/* -css/syntax/* -css/syntax/tokenizer/* -``` - -Encoding: - -```{toctree} -:maxdepth: 1 -:glob: - -encoding/* -encoding/buffer/* -encoding/buffer/decode/* -encoding/buffer/encode/* -encoding/single/* -encoding/single/decode/* -encoding/single/encode/* -``` - -HTML: - ```{toctree} -:maxdepth: 1 -:glob: - -html/* -html/tokenizer/* -``` - -Unicode and Punycode: - -```{toctree} -:maxdepth: 1 -:glob: - -punycode/* -unicode/* -``` - -Selectors and styles: - -```{toctree} -:maxdepth: 1 -:glob: - -selectors/* -styles/* -``` - -URLs: - -```{toctree} -:maxdepth: 1 -:glob: - -url/* +:maxdepth: 2 + +css/index +encoding/index +html/index +punycode/index +unicode/index +selectors/index +styles/index +url/index ``` diff --git a/source/examples/punycode/decode.md b/source/examples/punycode/decode.md index d6b86c7..aed3e20 100644 --- a/source/examples/punycode/decode.md +++ b/source/examples/punycode/decode.md @@ -1,6 +1,6 @@ # Punycode Decoding Example -This article explains the implementation of a Punycode decoding utility found in the `lexbor/punycode/decode.c` file. The code example facilitates the decoding of encoded domain names into their regular representation, which is critical for handling internationalized domain names (IDNs). +This article explains the implementation of a Punycode decoding utility found in the [lexbor/punycode/decode.c](https://github.com/lexbor/lexbor/blob/master/examples/lexbor/punycode/decode.c) file. The code example facilitates the decoding of encoded domain names into their regular representation, which is critical for handling internationalized domain names (IDNs). 
## Overview diff --git a/source/examples/punycode/encode.md b/source/examples/punycode/encode.md index d369a19..89fd2cc 100644 --- a/source/examples/punycode/encode.md +++ b/source/examples/punycode/encode.md @@ -1,6 +1,6 @@ # Punycode Encoding Example -This article discusses the code example found in the file `lexbor/punycode/encode.c`, which demonstrates how to encode a string using the Punycode algorithm with the lexbor library. Punycode is a way to represent Internationalized Domain Names (IDNs) using only ASCII characters. This code facilitates reading input data, manages memory allocation dynamically, and encodes the input using a callback function to handle the output. +This article discusses the code example found in the file [lexbor/punycode/encode.c](https://github.com/lexbor/lexbor/blob/master/examples/lexbor/punycode/encode.c), which demonstrates how to encode a string using the Punycode algorithm with the lexbor library. Punycode is a way to represent Internationalized Domain Names (IDNs) using only ASCII characters. This code facilitates reading input data, manages memory allocation dynamically, and encodes the input using a callback function to handle the output. ## Code Explanation @@ -90,4 +90,4 @@ failed: ## Conclusion -This article provides a comprehensive overview of the `lexbor/punycode/encode.c` example, illustrating how to implement Punycode encoding in C. The example highlights important practices such as dynamic memory management, error handling, and the use of callback functions, which are all vital when dealing with input and output in systems programming. By following this structured approach, developers can efficiently utilize the lexbor library to handle Internationalized Domain Names. \ No newline at end of file +This article provides a comprehensive overview of the [lexbor/punycode/encode.c](https://github.com/lexbor/lexbor/blob/master/examples/lexbor/punycode/encode.c) example, illustrating how to implement Punycode encoding in C. 
The example highlights important practices such as dynamic memory management, error handling, and the use of callback functions, which are all vital when dealing with input and output in systems programming. By following this structured approach, developers can efficiently utilize the lexbor library to handle Internationalized Domain Names. \ No newline at end of file diff --git a/source/examples/punycode/index.md b/source/examples/punycode/index.md new file mode 100644 index 0000000..9103ac2 --- /dev/null +++ b/source/examples/punycode/index.md @@ -0,0 +1,10 @@ +# Punycode Examples + +These articles delve into some of the examples in more detail to help you use `lexbor` more effectively. + +```{toctree} +:maxdepth: 1 +:glob: + +* +``` diff --git a/source/examples/selectors/easy_way.md b/source/examples/selectors/easy_way.md index daeadd6..2db6591 100644 --- a/source/examples/selectors/easy_way.md +++ b/source/examples/selectors/easy_way.md @@ -1,6 +1,6 @@ # CSS Selectors Usage Example -This article explains an example program found in the file `lexbor/selectors/easy_way.c`, which demonstrates how to use the Lexbor library to parse HTML and match it against CSS selectors. The example involves creating an HTML document, defining CSS selectors, and then finding matching nodes in the document. +This article explains an example program found in the file [lexbor/selectors/easy_way.c](https://github.com/lexbor/lexbor/blob/master/examples/lexbor/selectors/easy_way.c), which demonstrates how to use the Lexbor library to parse HTML and match it against CSS selectors. The example involves creating an HTML document, defining CSS selectors, and then finding matching nodes in the document. 
## Overview of the Code diff --git a/source/examples/selectors/index.md b/source/examples/selectors/index.md new file mode 100644 index 0000000..a358b60 --- /dev/null +++ b/source/examples/selectors/index.md @@ -0,0 +1,10 @@ +# Selectors Examples + +These articles delve into some of the examples in more detail to help you use `lexbor` more effectively. + +```{toctree} +:maxdepth: 1 +:glob: + +* +``` diff --git a/source/examples/selectors/normal_way.md b/source/examples/selectors/normal_way.md index 1ea3c75..6d07d7d 100644 --- a/source/examples/selectors/normal_way.md +++ b/source/examples/selectors/normal_way.md @@ -1,6 +1,6 @@ # CSS Selectors Parsing and Node Finding Example -This example, found in the source file `lexbor/selectors/normal_way.c`, demonstrates how to use the Lexbor library to parse CSS selectors and find HTML nodes that match those selectors. The code provides a comprehensive workflow, from creating an HTML document to parsing selectors and retrieving matching nodes while handling memory management efficiently. +This example, found in the source file [lexbor/selectors/normal_way.c](https://github.com/lexbor/lexbor/blob/master/examples/lexbor/selectors/normal_way.c), demonstrates how to use the Lexbor library to parse CSS selectors and find HTML nodes that match those selectors. The code provides a comprehensive workflow, from creating an HTML document to parsing selectors and retrieving matching nodes while handling memory management efficiently. ## Overview of Key Components diff --git a/source/examples/styles/attribute_style.md b/source/examples/styles/attribute_style.md index ef3ba98..7c16042 100644 --- a/source/examples/styles/attribute_style.md +++ b/source/examples/styles/attribute_style.md @@ -1,6 +1,6 @@ # CSS Style Attribute Example -This article provides an in-depth explanation of a code example found in the `lexbor/styles/attribute_style.c` file. 
The purpose of this code is to demonstrate how to create an HTML document, parse a specific HTML element, retrieve its CSS style properties, and then serialize those properties for output. +This article provides an in-depth explanation of a code example found in the [lexbor/styles/attribute_style.c](https://github.com/lexbor/lexbor/blob/master/examples/lexbor/styles/attribute_style.c) file. The purpose of this code is to demonstrate how to create an HTML document, parse a specific HTML element, retrieve its CSS style properties, and then serialize those properties for output. ## Code Breakdown diff --git a/source/examples/styles/events_insert.md b/source/examples/styles/events_insert.md index ff66acb..cc3bbe6 100644 --- a/source/examples/styles/events_insert.md +++ b/source/examples/styles/events_insert.md @@ -1,6 +1,6 @@ # Events Insert Example -This article explains the C code found in `lexbor/styles/events_insert.c`, which demonstrates the process of manipulating HTML documents and applying CSS styles using the Lexbor library. The code operates on a simple HTML structure and applies specific styles based on a CSS stylesheet. +This article explains the C code found in [lexbor/styles/events_insert.c](https://github.com/lexbor/lexbor/blob/master/examples/lexbor/styles/events_insert.c), which demonstrates the process of manipulating HTML documents and applying CSS styles using the Lexbor library. The code operates on a simple HTML structure and applies specific styles based on a CSS stylesheet. ## Overview @@ -110,4 +110,4 @@ Finally, all allocated resources are cleaned up to prevent memory leaks by destr ## Conclusion -The code in `lexbor/styles/events_insert.c` illustrates an effective use of the Lexbor library to manipulate HTML and apply CSS. By parsing, creating elements, setting attributes, and attaching styles, it provides a clear example of dynamic document editing and processing. 
This showcases both the capabilities and convenience of the Lexbor framework in handling web technologies programmatically. \ No newline at end of file +The code in [lexbor/styles/events_insert.c](https://github.com/lexbor/lexbor/blob/master/examples/lexbor/styles/events_insert.c) illustrates an effective use of the Lexbor library to manipulate HTML and apply CSS. By parsing, creating elements, setting attributes, and attaching styles, it provides a clear example of dynamic document editing and processing. This showcases both the capabilities and convenience of the Lexbor framework in handling web technologies programmatically. \ No newline at end of file diff --git a/source/examples/styles/index.md b/source/examples/styles/index.md new file mode 100644 index 0000000..282f65f --- /dev/null +++ b/source/examples/styles/index.md @@ -0,0 +1,10 @@ +# Styles Examples + +These articles delve into some of the examples in more detail to help you use `lexbor` more effectively. + +```{toctree} +:maxdepth: 1 +:glob: + +* +``` diff --git a/source/examples/styles/stylesheet.md b/source/examples/styles/stylesheet.md index 534cea3..03999fc 100644 --- a/source/examples/styles/stylesheet.md +++ b/source/examples/styles/stylesheet.md @@ -1,6 +1,6 @@ # CSS Stylesheet Parsing and Application Example -In this article, we will explore the implementation of CSS stylesheet parsing and application to HTML elements using the Lexbor library. The following example is derived from the source file `lexbor/styles/stylesheet.c`. The code illustrates how to create an HTML document, parse CSS styles, attach these styles to the HTML document, and finally retrieve and serialize specific style declarations from an element. +In this article, we will explore the implementation of CSS stylesheet parsing and application to HTML elements using the Lexbor library. 
The following example is derived from the source file [lexbor/styles/stylesheet.c](https://github.com/lexbor/lexbor/blob/master/examples/lexbor/styles/stylesheet.c). The code illustrates how to create an HTML document, parse CSS styles, attach these styles to the HTML document, and finally retrieve and serialize specific style declarations from an element. ## Overview diff --git a/source/examples/styles/walk.md b/source/examples/styles/walk.md index ca012b7..cf3f551 100644 --- a/source/examples/styles/walk.md +++ b/source/examples/styles/walk.md @@ -1,6 +1,6 @@ # CSS Style Walking Example -This article explains the functionality and structure of the code found in `lexbor/styles/walk.c`. The example focuses on parsing an HTML document, attaching CSS styles to an element, and traversing the applied styles. The primary goal of this example is to demonstrate how to manipulate the Document Object Model (DOM) and apply CSS styling in the Lexbor library. +This article explains the functionality and structure of the code found in [lexbor/styles/walk.c](https://github.com/lexbor/lexbor/blob/master/examples/lexbor/styles/walk.c). The example focuses on parsing an HTML document, attaching CSS styles to an element, and traversing the applied styles. The primary goal of this example is to demonstrate how to manipulate the Document Object Model (DOM) and apply CSS styling in the Lexbor library. ## Overview of the Code diff --git a/source/examples/unicode/idna_to_ascii.md b/source/examples/unicode/idna_to_ascii.md index fe3abcb..62586b0 100644 --- a/source/examples/unicode/idna_to_ascii.md +++ b/source/examples/unicode/idna_to_ascii.md @@ -1,6 +1,6 @@ # IDNA to ASCII Conversion Example -This document provides an explanation of the IDNA to ASCII conversion code example located in the `lexbor/unicode/idna_to_ascii.c` source file. 
The code focuses on converting Internationalized Domain Names (IDN) from their Unicode representations to ASCII, which is often required for compatibility with DNS systems. +This document provides an explanation of the IDNA to ASCII conversion code example located in the [lexbor/unicode/idna_to_ascii.c](https://github.com/lexbor/lexbor/blob/master/examples/lexbor/unicode/idna_to_ascii.c) source file. The code focuses on converting Internationalized Domain Names (IDNs) from their Unicode representations to ASCII, which is often required for compatibility with DNS. ## Overview diff --git a/source/examples/unicode/index.md b/source/examples/unicode/index.md new file mode 100644 index 0000000..01af28a --- /dev/null +++ b/source/examples/unicode/index.md @@ -0,0 +1,10 @@ +# Unicode Examples + +These articles delve into some of the examples in more detail to help you use `lexbor` more effectively. + +```{toctree} +:maxdepth: 1 +:glob: + +* +``` diff --git a/source/examples/unicode/normalization_form.md b/source/examples/unicode/normalization_form.md index 171b129..13c5dd0 100644 --- a/source/examples/unicode/normalization_form.md +++ b/source/examples/unicode/normalization_form.md @@ -1,6 +1,6 @@ # Unicode Normalization Example -This article explains the example code found in the file `lexbor/unicode/normalization_form.c`. The program demonstrates how to perform Unicode normalization using the Lexbor library, specifically focusing on four normalization forms: NFC, NFD, NFKC, and NFKD. +This article explains the example code found in the file [lexbor/unicode/normalization_form.c](https://github.com/lexbor/lexbor/blob/master/examples/lexbor/unicode/normalization_form.c). The program demonstrates how to perform Unicode normalization using the Lexbor library, specifically focusing on four normalization forms: NFC, NFD, NFKC, and NFKD.
## Introduction diff --git a/source/examples/unicode/normalization_form_stdin.md b/source/examples/unicode/normalization_form_stdin.md index 2b0d569..cfb482a 100644 --- a/source/examples/unicode/normalization_form_stdin.md +++ b/source/examples/unicode/normalization_form_stdin.md @@ -1,6 +1,6 @@ # Unicode Normalization Form Example -This article describes the implementation found in the file `lexbor/unicode/normalization_form_stdin.c`. The purpose of this code example is to read input from standard input (stdin), apply a specified Unicode normalization form, and print the normalized output. The program supports four normalization forms: NFC, NFD, NFKC, and NFKD. +This article describes the implementation found in the file [lexbor/unicode/normalization_form_stdin.c](https://github.com/lexbor/lexbor/blob/master/examples/lexbor/unicode/normalization_form_stdin.c). The purpose of this code example is to read input from standard input (stdin), apply a specified Unicode normalization form, and print the normalized output. The program supports four normalization forms: NFC, NFD, NFKC, and NFKD. ## Overview of the Code diff --git a/source/examples/url/index.md b/source/examples/url/index.md new file mode 100644 index 0000000..364aa5d --- /dev/null +++ b/source/examples/url/index.md @@ -0,0 +1,10 @@ +# URL Examples + +These articles delve into some of the examples in more detail to help you use `lexbor` more effectively. + +```{toctree} +:maxdepth: 1 +:glob: + +* +``` diff --git a/source/examples/url/parse.md b/source/examples/url/parse.md index 6d4a9ff..b488149 100644 --- a/source/examples/url/parse.md +++ b/source/examples/url/parse.md @@ -1,6 +1,6 @@ # URL Parsing Example -This article examines a code example from the `lexbor/url/parse.c` file, focusing on URL parsing using the Lexbor library. 
The intent of this code is to demonstrate how to initialize the URL parser, parse a URL string, and subsequently serialize different components of the parsed URL, such as the scheme, username, password, host, and more. Each section of the code plays a critical role in handling URL data. +This article examines a code example from the [lexbor/url/parse.c](https://github.com/lexbor/lexbor/blob/master/examples/lexbor/url/parse.c) file, focusing on URL parsing using the Lexbor library. The intent of this code is to demonstrate how to initialize the URL parser, parse a URL string, and subsequently serialize different components of the parsed URL, such as the scheme, username, password, host, and more. Each section of the code plays a critical role in handling URL data. ## Code Breakdown diff --git a/source/examples/url/relative.md b/source/examples/url/relative.md index a612710..ae21b13 100644 --- a/source/examples/url/relative.md +++ b/source/examples/url/relative.md @@ -1,6 +1,6 @@ # URL Parsing Example -This article provides an explanation of the URL parsing example found in the source file `lexbor/url/relative.c`. The example demonstrates the parsing of a relative URL based on a provided base URL using the lexbor library. It outlines the setup of the URL parser, handling of input strings, and the serialization of various components of the parsed URL. +This article provides an explanation of the URL parsing example found in the source file [lexbor/url/relative.c](https://github.com/lexbor/lexbor/blob/master/examples/lexbor/url/relative.c). The example demonstrates the parsing of a relative URL based on a provided base URL using the lexbor library. It outlines the setup of the URL parser, handling of input strings, and the serialization of various components of the parsed URL. 
## Code Breakdown diff --git a/source/index.md b/source/index.md index 2905bf6..f607ac4 100644 --- a/source/index.md +++ b/source/index.md @@ -6,6 +6,7 @@ documentation download roadmap articles/index +examples/index support license ``` From 4dfb8bbd31facb84781ca578a53a19d6cdd0e28d Mon Sep 17 00:00:00 2001 From: Toxypi Date: Sat, 28 Sep 2024 09:12:58 +0100 Subject: [PATCH 4/9] Makefile: Added spellcheck. --- .spellcheck_ignore.txt | 207 ++++++++++++++++++ Makefile | 9 +- source/articles/part-1-html.md | 2 +- source/articles/part-2-css.md | 14 +- source/documentation.md | 4 +- source/download.md | 4 +- .../css/syntax/structure_parse_file.md | 2 +- source/examples/punycode/decode.md | 2 +- source/examples/selectors/easy_way.md | 2 +- source/examples/selectors/unique_nodes.md | 2 +- source/examples/styles/attribute_style.md | 2 +- source/examples/styles/events_insert.md | 2 +- source/index.md | 4 +- spellcheck.yaml | 30 +++ spellcheck_dict.dic | Bin 0 -> 6224 bytes 15 files changed, 264 insertions(+), 22 deletions(-) create mode 100644 .spellcheck_ignore.txt create mode 100644 spellcheck.yaml create mode 100644 spellcheck_dict.dic diff --git a/.spellcheck_ignore.txt b/.spellcheck_ignore.txt new file mode 100644 index 0000000..80a4ca9 --- /dev/null +++ b/.spellcheck_ignore.txt @@ -0,0 +1,207 @@ +ASAN +ascii +AVL +backend +Borisov +br +BST +CentOS +CMake +codebase +Combinatorial +Combinators +combinators +CPP +css +CSSOM +csswg +customizable +CXX +deallocates +Deallocation +distros +DNS +DOCTYPE +DOM +encoding's +encodings +Encodings +EOF +EUC +Flexbox +frontend +fuzzer +fuzzers +gb +glyphs +Homebrew +hostnames +href +html +IDN +IDNA +idna +IDNs +img +initializations +innerHTML +IoT +JIS +js +JSON +keyring +lexbor +Lexbor +LEXBOR +Lexbor's +lexbor's +li +lifecycle +lightningcss +LXB +lxb +macintosh +macOS +macos +MacPorts +macports +mailto +Makefiles +malloc +maxdepth +md +mediaqueries +mem +memset +microsoft +mkdir +mq +msan +MSYS +msys +multipage +mutexes +myhtml 
+namespace +Namespace +Namespaces +namespaces +NFD +nFind +NFKC +NFKD +NGINX +nHTML +NJS +normalizer +Normalizer +np +nResult +nsize +nTree +num +oklab +oklch +ol +OpenType +opentype +otff +outbuf +overline +parser's +parsers +pc +png +pos +pre +Preprocessing +preprocessing +prescan +Prescanning +printf +programmatically +Punycode +punycode +px +queueing +RCDATA +realloc +reallocations +releasever +renderer +repo +repos +rfc +rgb +rgba +RHEL +rhel +Roadmap +roadmap +ru +serializer +sexpr +sizeof +slctrs +spinlocks +src +ss +sst +stderr +stdin +stdout +str +STR +strlen +struct +stylesheet +Stylesheet +StyleSheet +stylesheets +subdirectory +substring +sudo +superfast +svg +SVG +symlink +szie +textarea +tkz +tle +tmp +toctree +Tokenization +tokenization +tokenize +Tokenizer +tokenizer +tokenizer's +tokenizing +tStyleSheet +txt +typedef +UB +ubuntu +uc +UI +ul +UNDEF +unformatted +unicode +url +usr +UTF +uTf +utf +UTILS +variadic +WHATWG +whatwg +whitespace +WHITESPACE +whitespaces +WS +www +xenial +YPE \ No newline at end of file diff --git a/Makefile b/Makefile index e1a576a..4ece681 100644 --- a/Makefile +++ b/Makefile @@ -1,5 +1,6 @@ +PYTHON ?= python3 SPHINX ?= sphinx-build -SERVER ?= python3 -m http.server +SERVER ?= $(PYTHON) -m http.server VENVDIR ?= .venv VENV ?= $(VENVDIR)/bin/activate @@ -21,7 +22,7 @@ endef # Ensure virtual environment exists $(VENVDIR): - python3 -m venv $(VENVDIR) + $(PYTHON) -m venv $(VENVDIR) # Install dependencies inside virtual environment .PHONY: install @@ -88,3 +89,7 @@ upload: clean-doc backup deploy .PHONY: linkcheck linkcheck: $(call venv_exec, $(SPHINX) -b linkcheck $(SOURCEDIR) $(BUILDDIR)) + +.PHONY: spellcheck +spellcheck: + $(call venv_exec, $(PYTHON) -m pyspelling -c spellcheck.yaml -j $(shell nproc)) \ No newline at end of file diff --git a/source/articles/part-1-html.md b/source/articles/part-1-html.md index 3dc9506..4f9e081 100644 --- a/source/articles/part-1-html.md +++ b/source/articles/part-1-html.md @@ -166,7 +166,7 
@@ In the HTML namespace, the `` tag is treated as text, so no `` element is created. In the SVG namespace, however, an element is created based on the `` tag. Thus, tags behave differently depending on the namespace. -But there’s more. The tokenizer must also be aware of the current namespace to +But there's more. The tokenizer must also be aware of the current namespace to process `CDATA` correctly. Consider two examples involving `CDATA` and two different namespaces: diff --git a/source/articles/part-2-css.md b/source/articles/part-2-css.md index 4ab8a03..74e4a45 100644 --- a/source/articles/part-2-css.md +++ b/source/articles/part-2-css.md @@ -3,7 +3,7 @@ Hello, everyone! We continue our series on developing a browser engine. Better late than never! -Despite the long break, I’ll update you on the lexbor project and its current +Despite the long break, I'll update you on the lexbor project and its current status at the end of this article. In this article, we'll explore the specifics of parsing Cascading Style Sheets @@ -41,7 +41,7 @@ Recommendation, etc. You can see all stages on marked with its current development stage, ranging from early drafts to final recommendations. -We will focus on Editor’s Draft and Working Draft with a glance at +We will focus on Editor's Draft and Working Draft with a glance at Recommendation. Since W3C standards evolve slowly, by the time a module reaches Recommendation, it might already be outdated. Thus, we'll treat CSS standards as living documents, like the HTML standard. @@ -116,7 +116,7 @@ A specific structure includes: 1. Stylesheet (formerly List of Rules) 2. At-Rule 3. Qualified Rule -4. Block’s contents +4. Block's contents 5. Declaration 6. Component value 7. Simple block @@ -242,7 +242,7 @@ Parsing selectors will proceed as follows: This sounds simple, but in practice, it's more complex: - The knowledge about which stage to switch to must be passed to each module; - they don’t inherently know this. 
+ they don't inherently know this. - We need to decide whether to consume the `{` or `}` token before passing it to the next stage. - Nesting depth must be tracked. We can't just pass control to the next module @@ -296,7 +296,7 @@ structure. This is a form of inside-out parsing. This approach is implemented in my `lexbor` project. -Here’s how it works: We set up callbacks for different stages of parsing the CSS +Here's how it works: We set up callbacks for different stages of parsing the CSS structure. Each callback is called only once at the beginning of a stage, not for every token. @@ -608,7 +608,7 @@ All these tests are valid, but the result after parsing will always be ` = a b c`. The question arises—how do we compare this with others? My intuition suggested that the task had become significantly more complicated, but a sense of determination (not foolishness) drove me to address it directly. As expected, -it didn’t work out right away; it required some thought! +it didn't work out right away; it required some thought! Consider this example: ``` @@ -633,7 +633,7 @@ inconsistencies. The most reliable solution turned out to be generating the test and the result separately. This means that forming the result goes through the same stages as -forming the test. Although this approach is costly, it’s manageable since +forming the test. Although this approach is costly, it's manageable since real-time performance is not a constraint. As a result, we now have an excellent tool for generating tests for grammars. diff --git a/source/documentation.md b/source/documentation.md index d2fbe0f..6590e06 100644 --- a/source/documentation.md +++ b/source/documentation.md @@ -171,7 +171,7 @@ make object creation and memory management in our own way. Many classic algorithms used in `lexbor` are adapted to meet the specific needs of the project. 
-- We're open to using third-party code, but it’s often simpler to start from +- We're open to using third-party code, but it's often simpler to start from scratch than to add extra dependencies (looking at you, Node.js). - Some functions are platform-dependent, such as threading, timers, I/O, and @@ -324,7 +324,7 @@ void - The `*_destroy` functions always check if the object is `NULL`; if so, they return `NULL`. -- If the `*_destroy` function doesn’t take the `bool self_destroy` argument, the +- If the `*_destroy` function doesn't take the `bool self_destroy` argument, the object can only be created using the `*_create` function (i.e., not on the stack). diff --git a/source/download.md b/source/download.md index 08ceb18..7c14421 100644 --- a/source/download.md +++ b/source/download.md @@ -43,7 +43,7 @@ The `lexbor` binaries are available for: 1. Download the `lexbor` [signing key](https://lexbor.com/keys/lexbor_signing.key) used for our repositories - and add it to `apt`’s keyring: + and add it to `apt`'s keyring: ```sh curl -O https://lexbor.com/keys/lexbor_signing.key @@ -115,7 +115,7 @@ The `lexbor` binaries are available for: 1. Download the `lexbor` [signing key](https://lexbor.com/keys/lexbor_signing.key) used for our repositories - and add it to `apt`’s keyring: + and add it to `apt`'s keyring: ```sh curl -O https://lexbor.com/keys/lexbor_signing.key diff --git a/source/examples/css/syntax/structure_parse_file.md b/source/examples/css/syntax/structure_parse_file.md index 3e79792..10efbfb 100644 --- a/source/examples/css/syntax/structure_parse_file.md +++ b/source/examples/css/syntax/structure_parse_file.md @@ -22,7 +22,7 @@ The `main` function performs several key operations: ### CSS Parsing Implementation -The `css_parse` function is crucial as it sets up the parsing buffer and pushes the initial parsing rules onto a stack. 
Here’s a breakdown of its functionality: +The `css_parse` function is crucial as it sets up the parsing buffer and pushes the initial parsing rules onto a stack. Here's a breakdown of its functionality: - **Set Buffer**: The parsing buffer of the parser is set with the provided CSS data and its length. diff --git a/source/examples/punycode/decode.md b/source/examples/punycode/decode.md index aed3e20..2d182f3 100644 --- a/source/examples/punycode/decode.md +++ b/source/examples/punycode/decode.md @@ -46,7 +46,7 @@ if (p + size > end) { } ``` -If there isn’t sufficient space, it reallocates memory to increase the buffer size by threefold. If this operation fails, an error message is displayed and the program jumps to the `failed` label to free allocated memory and exit. +If there isn't sufficient space, it reallocates memory to increase the buffer size threefold. If this operation fails, an error message is displayed and the program jumps to the `failed` label to free allocated memory and exit. ### Input Cleaning diff --git a/source/examples/selectors/easy_way.md b/source/examples/selectors/easy_way.md index 2db6591..1fc5592 100644 --- a/source/examples/selectors/easy_way.md +++ b/source/examples/selectors/easy_way.md @@ -10,7 +10,7 @@ The program begins with the inclusion of necessary headers from the Lexbor libra - **callback**: This function acts as a callback for serializing HTML nodes. It takes a pointer to data representing the node's content and its length, printing the content to the standard output. -- **find_callback**: This callback function is invoked for each matching node found by the CSS selectors.
It increments the count of matched nodes, prints the count, and calls the serialization callback to output the node’s content. +- **find_callback**: This callback function is invoked for each matching node found by the CSS selectors. It increments the count of matched nodes, prints the count, and calls the serialization callback to output the node's content. ### Main Function Breakdown diff --git a/source/examples/selectors/unique_nodes.md b/source/examples/selectors/unique_nodes.md index 97f95dc..2165c99 100644 --- a/source/examples/selectors/unique_nodes.md +++ b/source/examples/selectors/unique_nodes.md @@ -30,7 +30,7 @@ The program outputs the serialized format of the HTML document using `lxb_html_s ### Finding HTML Nodes -The core functionality of this example is encapsulated in the `lxb_selectors_find()` function, which takes the selectors and attempts to match them against the nodes in the document’s body. A callback function, `find_callback`, is provided to handle each found node, incrementing a count and processing each matched node individually. If any part of this process fails, the program suitably returns an error status. +The core functionality of this example is encapsulated in the `lxb_selectors_find()` function, which takes the selectors and attempts to match them against the nodes in the document's body. A callback function, `find_callback`, is provided to handle each found node, incrementing a count and processing each matched node individually. If any part of this process fails, the program returns an error status. ### Cleanup diff --git a/source/examples/styles/attribute_style.md b/source/examples/styles/attribute_style.md index 7c16042..8f04fb9 100644 --- a/source/examples/styles/attribute_style.md +++ b/source/examples/styles/attribute_style.md @@ -6,7 +6,7 @@ This article provides an in-depth explanation of a code example found in the [le ### Header Files and Function Definition -The code begins with necessary includes, specifically `base.h`, along with lexbor’s HTML and CSS header files. This setup ensures that all necessary functions related to HTML document handling and CSS processing are available.
+The code begins with necessary includes, specifically `base.h`, along with lexbor's HTML and CSS header files. This setup ensures that all necessary functions related to HTML document handling and CSS processing are available. The `callback` function serves as a utility to print CSS property declarations. It takes a character pointer `data`, the length of data `len`, and a context pointer `ctx`. It uses `printf` to output the string, formatting it based on the provided length. This function is fundamental for logging purposes throughout the serialization process. diff --git a/source/examples/styles/events_insert.md b/source/examples/styles/events_insert.md index cc3bbe6..9b18371 100644 --- a/source/examples/styles/events_insert.md +++ b/source/examples/styles/events_insert.md @@ -4,7 +4,7 @@ This article explains the C code found in [lexbor/styles/events_insert.c](https: ## Overview -The provided code initializes an HTML document representation, parses a predefined HTML string, applies a CSS stylesheet, and manipulates the DOM to insert a new HTML element. Here’s a breakdown of the major sections of the code. +The provided code initializes an HTML document representation, parses a predefined HTML string, applies a CSS stylesheet, and manipulates the DOM to insert a new HTML element. Here's a breakdown of the major sections of the code. ## Code Breakdown diff --git a/source/index.md b/source/index.md index f607ac4..d762cdf 100644 --- a/source/index.md +++ b/source/index.md @@ -17,7 +17,7 @@ We build a web browser engine available as a software library; it ships under the Apache 2.0 license and has no extra dependencies. -# What’s the news? +# What's the news? Check out the latest version and our future roadmap [here](roadmap/) or follow our work on [GitHub](https://github.com/lexbor/lexbor). @@ -50,7 +50,7 @@ apps in a new environment? No problem. From the very start, we focus on availability and support in vastly different environments. 
-When you use lexbor, you need not worry about your app’s portability; from the +When you use lexbor, you need not worry about your app's portability; from the ground up, we designed lexbor for wide support across various computing platforms and IoT devices. diff --git a/spellcheck.yaml b/spellcheck.yaml new file mode 100644 index 0000000..397161e --- /dev/null +++ b/spellcheck.yaml @@ -0,0 +1,30 @@ +matrix: + - name: Markdown Source + sources: + - source/**/*.md + aspell: + lang: en + d: en_US + dictionary: + wordlists: + - .spellcheck_ignore.txt + output: spellcheck_dict.dic + pipeline: + - pyspelling.filters.context: + context_visible_first: true + escapes: '\\[\\`]' + delimiters: + - open: '(?s)^(?P *`{3,})' + close: ^(?P=open)$ + - open: '\]\(' + close: \) + - open: '[*]+' + close: '[*]+' + - open: '(?s)(?P[<])' + close: '[>]' + - open: (?P`+) + close: (?P=open) + - open: (?s)^(?P---)$ + close: ^(?P=open)$ + - pyspelling.filters.markdown: null + - pyspelling.filters.url: null diff --git a/spellcheck_dict.dic b/spellcheck_dict.dic new file mode 100644 index 0000000000000000000000000000000000000000..1fa05dcd195b2e696e881bdea065912281f63876 GIT binary patch literal 6224 zcmai&dyFLaQO9eir>AFkW_$KAd%Nd5`_ASZJDcQi9Bc%3g7@j&o5v0_eZKW2Ry;e~ zv)kEuU3bsz?#VLFgCs&C1eCx@F$716NF0gS{DBDyhy;|EoG1?>fe<7pJQN{YNO_?E zf=~Un*LMOTM(X|cuYOhat6x?9>Q}wZXyA7`XSe-!bJ&T`nvox#4g33@v#&q@`g`R6 z`Bj^r`ClB*LHXU;Bk~|6@?1vb@oAB-LMn_L(G@HnhbPVB@*Mxp`aK!!_IrLTm~c!z z!ue!4zbK+Q=2wsO%$G!L9hZ-E&R0aTIn)Y4ZUX+p`TTRe!C+2?f^^J^;3$-pHqbYPDSjvOA$NnRz&{QipcAp zO342-h?uC;%j`?HUdFzA6`6knSXmSK8Aa&#DI)NIBI~?S5q-a)2>$|DMbDBVa=c0v zxrY_uzo3Xcyh;^&Fj1<=Zzv+qlzDzp5jj|-1`{k&V*rcP7(5E<+?y3yddMFt zBL9RU@=q!v|CA!~Z&yU_9pEMOyi*bRcY()0jZKk9^X)c+RTDSOyPwxFiNdRB{^>sE z3H*T~>t0tRP|qn6h(9s7Cf_nxl7BJi$@vkT-;_5g;^4O^V%HM}tMV%b>vCNY`#xrH zQJy!rDxWi0mVY!@l^=qOB;4((qr85D!G^p^5eL=4RTAqlMFJBlvfg_PHsm+KwaTLW zo+1JLV@2e?sE8x~4%9qn)zy;x*kDy=GCKC;PDSjw zR}p)b3|3{sU_%~PWSy?TvK$(0$kU4W>AQ;chPORh 
z^7gX9lC%w0<(CcCcSYip8&fRH8H05R40`em=#h}$G}w?ways_p zNk#Vg5rZZ9eMRE)w+3tSJw*!Qj}&X3Sb30zX244nu!j^WG!26d>4N{)4-?&-wZFeF zqa*X;3BwD?La-Zzqxy?wFqp?uuX^Jsw4BWz}QXMf+lP zd1X~@%~BQ1jZIXzrwTzYh=XP)xYmq=elL!?>+xd*y~W1^c46&daTcu#{u1FIPEZXZu=MY*qwJ(XSS zcm1u<-^X8C`Q6F%Zs@lOIrYCvZ$Z3v`%&C$cKwLGn8>^1>D{>7!E$N{mhWugM(zdW z&g^s!2fLA+a=0Ld|Myeea+NRvYD#thnLa;;Yw8CrU^*si@) zY3}(lk>WtJ$16jB=#yAPk*kISvsA0GLA`Ep>YjfX>f~GsFS(A@iXsing>^>x)Y~GN zu}X9r?x@Yo{*IB~P33~w6TkdS3em_w3ZnebUYdo;&c%FJYsW_h2>a6wje({uSwu(KQY zaD(oC(&onG>{4mEhe_D<$h)cT6>O}OR>@#{W>lTr4mw2n48$635=&K@ty({fBVv>7 zHd_PpK9i}GNK-OydbAs~!hY0m$1-PCSiLxE!?%GMQ~J`6$WQ2A?I9?rIN0_F@h<+Z zXz3`}-M#G~l!B$VGx^$Wb~=5k8DplW7zdrWuV$7#E<$%|w97|JFlg@h#Gfl17k2$F z=~t<6PE>>6jrfdU0V3#5q`OhGr}B-eN0>3V*$R4bKiVY|DjW)qYo+lh_Kw4P7?IOl z{OX)-nX@Y%8&!{X{q3Mh=7jh{{T`UO?yB|Hb_m@A;;DXm(45(G+nTJ zs{yAoZ)x-36o6%!u#?*A+C3AfT#B3CEm#B6n8}g`mCUR@yiUi6s1AV(v_~nL3p~&p zcIA$AwL&0g+_9?GZK@{c-aRqayD_@M=0MXHP@SuKoCnPo1@y$IIooD%E@L>3)Eu)A z{OPe*c+^9*E_kRnz!P*YL@GV$^y+>@XmGqXGDNl)VS*0nXVT+#&kKED#zr`@dF#>I z3g=k^nn?{>T0#Z~a$A}fNAKEfZP+_B7YfIHaQvQ2(fPPj)*$rdwrs6lsuKm$`t-yg z?C*rlE|*o%B7Y7^`5GmWVphxys0LK**EqF$t==FdW+umLp^r$*k0PxU?#Xdf-b9#2 zx}mkE=r9qr$e-DaAyuinIi!`Q$BO{qPHxfxd4AfevGEi6?4TJ&e#nt@3(nDdL{-4{ zd@39gKyAQ~3EM5Ifp%e(sgM%Ng#k3l<$8%b*ePV{OUn!1f)rC!QS!9DwxYEq9q#%a znY8M9WQtDcADx+#83^X)oEuMQ$-P-j*UQ{0?#x)|cYKNzZ3LwY7h%+7Hsr$aLrD1) zJ)Y|K2Ymup*S$^HI%0Qe`aQ1RoKx>Nx4X@O?k-GT3nOr5Qd(*YsY_%}hDzmWL7p9r zhFei=O7cmE){w2S=!raJ4!+;wN;s5hOPfWCV&jpA(Q8FlHS=j4O~mcUKNw(eh7Qux z<2aRxI;}X=gGe)}hh@@<4!d+ldsGtbE8WS=f#1`+W+KhOr@l>Pmgo|wOGTSTQ%l7O zOdI%NyBWo5aILC^nF0|Ff*v(!FCtXhgL1y2kWi%?cu|i6!8Kip@tEein~tspzAmj^ zE@U+E`)!#{=`xNLu>gfA8gBRHUYmAxlO%V?8u4MrZ|wRMtK(5*M(*rbq;p53*^H53 znVH-TGG*1m!^UUoP<$5SBDCXR$(zN2x`qTs48_U!-03VeIBt^jBsE<= z6=up=-dOUq%1kN+rU@tRlQWc{T#7FM)xm9yUE~y+XLH*}$EX=o%?%l|?3w)XF_{ij zMQIYotz}w-yrt)Xa~yXEI>1Bp>&pO3)=cqnv@7NRY~0O!L}=L4v$^=DxKG*8O-4K0 z0rBCSO{MwHAnB&ESDU2ScF>ZOmVR>39b8&l@GfycrZW4x&6wupoPEjDf|CRi%@a_F0o_Fyr_hY_CmAuP${x9*( 
ztMB*oKL$k$n&KDXd4cbw5AltyV^tY3*7p~9|EGjc$1f#1^!+`282uT~EzIA|_nFf4 z_pxM6eb4b-^*FZZ`^Q%{_lB;$sV@g{}l5dX71bYZzp?tEWyoW?{#EU=LeDd9J-!P~UDR$~6!$h?ZVPap3Sf4VotLizO< zK}*5&>*za&ET7G?15N)2cp5!-vF;xrdy?_T6T3gkdlCBmtmh+FVSXK%??W%J=9kfT z3BSBAvHx6xKVZ&^J~bz{eRf@+sF?R-zfj{H~H+Hto^=Z{3Li9dF46& z{SE7%hyGURS6Szn{#kU6K)aIgKZGqG!L~QU{{(YOynhIr?q=Pq-~)+G8P?N(a$kpU z6~0e^s`Foo#Sf5qoagDp);FU2R^-kgKg#@9kX7CeDt;ANeZTqlt;81nAF Date: Sat, 28 Sep 2024 10:20:35 +0100 Subject: [PATCH 5/9] Formatted examples for readability. --- source/examples/css/StyleSheet.md | 50 ++++++++---- source/examples/css/index.md | 3 +- .../examples/css/selectors/list_easy_way.md | 51 +++++++++--- .../examples/css/selectors/list_fast_way.md | 62 +++++++++++---- source/examples/css/syntax/simple_colorize.md | 62 +++++++++++---- .../css/syntax/structure_parse_file.md | 74 ++++++++++++++---- .../css/syntax/tokenizer/chunks_stdin.md | 58 ++++++++++---- .../css/syntax/tokenizer/from_file.md | 53 +++++++++---- .../css/syntax/tokenizer/print_raw.md | 56 ++++++++++---- .../examples/encoding/buffer/decode/decode.md | 55 ++++++++++--- .../encoding/buffer/decode/decoder.md | 61 ++++++++++++--- .../encoding/buffer/decode/validate.md | 48 +++++++++--- .../examples/encoding/buffer/encode/encode.md | 42 +++++++--- .../encoding/buffer/encode/encoder.md | 63 +++++++++++---- .../encoding/buffer/encode/validate.md | 61 ++++++++++++--- source/examples/encoding/buffer/from_to.md | 54 ++++++++++--- source/examples/encoding/data_by_name.md | 47 ++++++++--- source/examples/encoding/index.md | 3 +- .../examples/encoding/single/decode/decode.md | 51 ++++++++---- .../encoding/single/decode/decoder.md | 48 ++++++++---- .../encoding/single/decode/validate.md | 43 +++++++++-- .../examples/encoding/single/encode/encode.md | 56 +++++++++----- .../encoding/single/encode/encoder.md | 46 ++++++++--- .../encoding/single/encode/validate.md | 57 ++++++++++---- source/examples/encoding/single/from_to.md | 67 ++++++++++++---- 
source/examples/html/document_parse.md | 55 +++++++++---- source/examples/html/document_parse_chunk.md | 55 ++++++++++--- source/examples/html/document_title.md | 60 +++++++++++---- source/examples/html/element_attributes.md | 51 ++++++++---- source/examples/html/element_create.md | 57 +++++++++----- source/examples/html/element_innerHTML.md | 46 ++++++++--- source/examples/html/elements_by_attr.md | 53 ++++++++----- .../examples/html/elements_by_class_name.md | 44 ++++++++--- source/examples/html/elements_by_tag_name.md | 53 +++++++++---- source/examples/html/encoding.md | 63 ++++++++++++--- source/examples/html/html2sexpr.md | 58 ++++++++++---- source/examples/html/index.md | 3 +- source/examples/html/parse.md | 48 ++++++++++-- source/examples/html/parse_chunk.md | 51 +++++++++--- source/examples/html/tokenizer/callback.md | 64 +++++++++++---- source/examples/html/tokenizer/simple.md | 58 ++++++++++---- .../examples/html/tokenizer/tag_attributes.md | 57 ++++++++++---- source/examples/html/tokenizer/text.md | 51 +++++++++--- source/examples/index.md | 3 +- source/examples/punycode/decode.md | 55 +++++++++---- source/examples/punycode/encode.md | 50 +++++++++--- source/examples/punycode/index.md | 3 +- source/examples/selectors/easy_way.md | 64 +++++++++++---- source/examples/selectors/index.md | 3 +- source/examples/selectors/normal_way.md | 65 ++++++++++++---- source/examples/selectors/unique_nodes.md | 64 ++++++++++++--- source/examples/styles/attribute_style.md | 64 +++++++++++---- source/examples/styles/events_insert.md | 64 ++++++++++----- source/examples/styles/index.md | 3 +- source/examples/styles/stylesheet.md | 60 +++++++++++---- source/examples/styles/walk.md | 77 +++++++++++++------ source/examples/unicode/idna_to_ascii.md | 69 +++++++++++++---- source/examples/unicode/index.md | 3 +- source/examples/unicode/normalization_form.md | 58 ++++++++++---- .../unicode/normalization_form_stdin.md | 51 +++++++++--- source/examples/url/index.md | 3 +- 
source/examples/url/parse.md | 56 ++++++++++---- source/examples/url/relative.md | 50 +++++++++--- 63 files changed, 2342 insertions(+), 721 deletions(-) diff --git a/source/examples/css/StyleSheet.md b/source/examples/css/StyleSheet.md index 75498ca..36a5728 100644 --- a/source/examples/css/StyleSheet.md +++ b/source/examples/css/StyleSheet.md @@ -1,16 +1,24 @@ # CSS Stylesheet Parsing Example -This article explains the example code within the file [lexbor/css/StyleSheet.c](https://github.com/lexbor/lexbor/blob/master/examples/lexbor/css/StyleSheet.c), which demonstrates how to use the Lexbor library to read and parse a CSS stylesheet. The code showcases the steps required to initialize the parser, read the CSS data from a file, parse the stylesheet, and serialize the resulting object. +This article explains the example code within the file +[lexbor/css/StyleSheet.c](https://github.com/lexbor/lexbor/blob/master/examples/lexbor/css/StyleSheet.c), +which demonstrates how to use the Lexbor library to read and parse a CSS +stylesheet. The code showcases the steps required to initialize the parser, read +the CSS data from a file, parse the stylesheet, and serialize the resulting +object. ## Code Breakdown ### Includes and Function Declaration -The code begins by including the necessary headers: `base.h` for foundational functionalities and `lexbor/core/fs.h` and `lexbor/css/css.h` for file system operations and CSS processing respectively. +The code begins by including the necessary headers: `base.h` for foundational +functionalities and `lexbor/core/fs.h` and `lexbor/css/css.h` for file system +operations and CSS processing respectively. 
### Callback Function -A callback function is defined that takes a pointer to character data, its length, and a context pointer as parameters: +A callback function is defined that takes a pointer to character data, its +length, and a context pointer as parameters: ```c lxb_status_t callback(const lxb_char_t *data, size_t len, void *ctx) { @@ -19,11 +27,15 @@ lxb_status_t callback(const lxb_char_t *data, size_t len, void *ctx) { } ``` -This function will be used later to output the serialized CSS rules. It prints the data passed to it, formatted to handle the length of the string, ensuring that only the relevant part of the buffer is printed. +This function will be used later to output the serialized CSS rules. It prints +the data passed to it, formatted to handle the length of the string, ensuring +that only the relevant part of the buffer is printed. ### Main Function -The `main` function initializes the program and takes one argument: the path to a CSS file. It begins by checking if the number of arguments is correct and printing usage instructions if not: +The `main` function initializes the program and takes one argument: the path to +a CSS file. It begins by checking if the number of arguments is correct and +printing usage instructions if not: ```c if (argc != 2) { @@ -45,11 +57,14 @@ if (css == NULL) { } ``` -The `lexbor_fs_file_easy_read` function loads the file into the `css` buffer, and the length of the data is stored in `css_len`. If reading the file fails, an error message is displayed. +The `lexbor_fs_file_easy_read` function loads the file into the `css` buffer, +and the length of the data is stored in `css_len`. If reading the file fails, an +error message is displayed. 
### Parsing the CSS -After successfully loading the CSS data, a CSS parser is created and initialized: +After successfully loading the CSS data, a CSS parser is created and +initialized: ```c parser = lxb_css_parser_create(); @@ -59,7 +74,8 @@ if (status != LXB_STATUS_OK) { } ``` -The parser initialization must succeed; otherwise, the program exits early with an error message. +The parser initialization must succeed; otherwise, the program exits early with +an error message. ### StyleSheet Parsing @@ -69,11 +85,13 @@ The actual parsing occurs with the following line: sst = lxb_css_stylesheet_parse(parser, css, css_len); ``` -Here, `lxb_css_stylesheet_parse` processes the loaded CSS content and generates a stylesheet object, `sst`. If parsing fails, the program will exit. +Here, `lxb_css_stylesheet_parse` processes the loaded CSS content and generates +a stylesheet object, `sst`. If parsing fails, the program will exit. ### Memory Management -Following the parsing step, memory for the CSS buffer is freed, and the parser is destroyed: +Following the parsing step, memory for the CSS buffer is freed, and the parser +is destroyed: ```c (void) lexbor_free(css); @@ -84,7 +102,8 @@ This cleanup is essential to avoid memory leaks in the application. ### Serializing the Output -The code then serializes the stylesheet and outputs the rules using the previously defined callback: +The code then serializes the stylesheet and outputs the rules using the +previously defined callback: ```c status = lxb_css_rule_serialize(sst->root, callback, NULL); @@ -93,7 +112,8 @@ if (status != LXB_STATUS_OK) { } ``` -This process invokes the callback for each rule in the stylesheet, allowing for customizable output handling. +This process invokes the callback for each rule in the stylesheet, allowing for +customizable output handling. ### Final Cleanup @@ -107,4 +127,8 @@ The program concludes successfully by returning `EXIT_SUCCESS`. 
## Summary -In this example, a CSS file is read, parsed, and its contents serialized using the Lexbor library. Each significant section of the code has been explained to provide clarity on the parsing process and resource management. By following these steps, developers can incorporate CSS parsing capabilities into their applications using Lexbor. \ No newline at end of file +In this example, a CSS file is read, parsed, and its contents serialized using +the Lexbor library. Each significant section of the code has been explained to +provide clarity on the parsing process and resource management. By following +these steps, developers can incorporate CSS parsing capabilities into their +applications using Lexbor. \ No newline at end of file diff --git a/source/examples/css/index.md b/source/examples/css/index.md index 80abb10..dbdd504 100644 --- a/source/examples/css/index.md +++ b/source/examples/css/index.md @@ -1,6 +1,7 @@ # CSS Examples -These articles delve into some of the examples in more detail to help you use `lexbor` more effectively. +These articles delve into some of the examples in more detail to help you use +`lexbor` more effectively. ```{toctree} :maxdepth: 1 diff --git a/source/examples/css/selectors/list_easy_way.md b/source/examples/css/selectors/list_easy_way.md index 522c695..4fa238d 100644 --- a/source/examples/css/selectors/list_easy_way.md +++ b/source/examples/css/selectors/list_easy_way.md @@ -1,16 +1,26 @@ # CSS Selector Parsing Example -This article provides an in-depth explanation of the code found in `list_easy_way.c`, which demonstrates how to use the lexbor library for parsing CSS selectors. The code illustrates the steps involved in initializing a parser, parsing a CSS selector string, and handling the results and logs. +This article provides an in-depth explanation of the code found in +`list_easy_way.c`, which demonstrates how to use the lexbor library for parsing +CSS selectors. 
The code illustrates the steps involved in initializing a parser, +parsing a CSS selector string, and handling the results and logs. ## Code Overview -The example begins by including the necessary header file from the lexbor CSS library. The main purpose of this code is to showcase the parsing of a CSS selector string, specifically `:has(div, :not(as, 1%, .class), #hash)`, using the lexbor's CSS parser. +The example begins by including the necessary header file from the lexbor CSS +library. The main purpose of this code is to showcase the parsing of a CSS +selector string, specifically `:has(div, :not(as, 1%, .class), #hash)`, using +lexbor's CSS parser. ## Key Sections of the Code ### Callback Function -The `callback` function is defined to handle output during the serialization process of the CSS selector list. It takes three parameters: a character pointer to the data, the length of that data, and a context pointer. Inside the function, the data is printed to the standard output using `printf`, formatted to respect the length provided. +The `callback` function is defined to handle output during the serialization +process of the CSS selector list. It takes three parameters: a character pointer +to the data, the length of that data, and a context pointer. Inside the +function, the data is printed to the standard output using `printf`, formatted +to respect the length provided. ```c lxb_status_t callback(const lxb_char_t *data, size_t len, void *ctx) { @@ -21,11 +31,16 @@ lxb_status_t callback(const lxb_char_t *data, size_t len, void *ctx) { ### Main Function -The `main` function begins by declaring variables for the parser and the selector list. It initializes the necessary constants for indentation used in log formatting and specifies the CSS selector string to be parsed. +The `main` function begins by declaring variables for the parser and the +selector list.
It initializes the necessary constants for indentation used in +log formatting and specifies the CSS selector string to be parsed. #### Parser Initialization -A parser is created with `lxb_css_parser_create()`, and its initialization is performed with `lxb_css_parser_init()`. The code checks the return status of the initialization and exits gracefully if there is an issue, preventing further execution with an invalid parser instance. +A parser is created with `lxb_css_parser_create()`, and its initialization is +performed with `lxb_css_parser_init()`. The code checks the return status of the +initialization and exits gracefully if there is an issue, preventing further +execution with an invalid parser instance. ```c parser = lxb_css_parser_create(); @@ -37,7 +52,10 @@ if (status != LXB_STATUS_OK) { #### Parsing the Selector -The parsing of the CSS selector occurs with the function `lxb_css_selectors_parse()`, which takes the parser, the selector string, and its length as arguments. The status of the parser is checked afterward to ensure that the parsing was successful. +The parsing of the CSS selector occurs with the function +`lxb_css_selectors_parse()`, which takes the parser, the selector string, and +its length as arguments. The status of the parser is checked afterward to ensure +that the parsing was successful. ```c list = lxb_css_selectors_parse(parser, slctrs, @@ -50,7 +68,10 @@ if (parser->status != LXB_STATUS_OK) { #### Selector List Serialization -The parsed selector list is then serialized using `lxb_css_selector_serialize_list()`, which invokes the previously defined `callback` function. This outputs the result of the serialization to standard output. +The parsed selector list is then serialized using +`lxb_css_selector_serialize_list()`, which invokes the previously defined +`callback` function. This outputs the result of the serialization to standard +output. 
```c (void) lxb_css_selector_serialize_list(list, callback, NULL); @@ -58,11 +79,16 @@ The parsed selector list is then serialized using `lxb_css_selector_serialize_li ### Handling Logs -If there are any logs generated during parsing, they are checked with `lxb_css_log_length()`, and the log is serialized in a similar manner, making use of the callback function and proper indentation for the displayed log. +If there are any logs generated during parsing, they are checked with +`lxb_css_log_length()`, and the log is serialized in a similar manner, making +use of the callback function and proper indentation for the displayed log. ### Cleanup -Finally, the example demonstrates proper resource management by destroying the parser and the associated memory. This is crucial in C programming to prevent memory leaks. The parser is destroyed first, followed by the cleanup of the selector list's memory. +Finally, the example demonstrates proper resource management by destroying the +parser and the associated memory. This is crucial in C programming to prevent +memory leaks. The parser is destroyed first, followed by the cleanup of the +selector list's memory. ```c (void) lxb_css_parser_destroy(parser, true); @@ -71,4 +97,9 @@ lxb_css_selector_list_destroy_memory(list); ## Conclusion -This example effectively showcases the functionality of the lexbor CSS library for parsing CSS selectors. From initializing the parser to handling logs and cleaning up memory, each step is crucial for ensuring that the program runs efficiently and correctly. The structured approach presented in the code promotes good practices in C programming, particularly regarding memory management and error handling. \ No newline at end of file +This example effectively showcases the functionality of the lexbor CSS library +for parsing CSS selectors. From initializing the parser to handling logs and +cleaning up memory, each step is crucial for ensuring that the program runs +efficiently and correctly. 
The structured approach presented in the code +promotes good practices in C programming, particularly regarding memory +management and error handling. \ No newline at end of file diff --git a/source/examples/css/selectors/list_fast_way.md b/source/examples/css/selectors/list_fast_way.md index f15671a..b4ec47b 100644 --- a/source/examples/css/selectors/list_fast_way.md +++ b/source/examples/css/selectors/list_fast_way.md @@ -1,49 +1,83 @@ # CSS Selectors Parsing Example -This article explains the functionality present in the `list_fast_way.c` source file of the lexbor CSS library, illustrating how to parse CSS selectors effectively. The primary goal of the code is to demonstrate the parsing of various CSS selectors and report the results, including any parsing warnings that may arise. +This article explains the functionality present in the `list_fast_way.c` source +file of the lexbor CSS library, illustrating how to parse CSS selectors +effectively. The primary goal of the code is to demonstrate the parsing of +various CSS selectors and report the results, including any parsing warnings +that may arise. ## Code Structure Overview -The entire program is structured around a single function `main`, which is the entry point when the program is executed. Several components of the code are critical to understanding how it prepares for and executes CSS selector parsing. +The entire program is structured around a single function `main`, which is the +entry point when the program is executed. Several components of the code are +critical to understanding how it prepares for and executes CSS selector parsing. ### Including Required Headers -The program begins by including `lexbor/css/css.h`, which is essential as it provides the functions, types, and structures required for working with the lexbor CSS parser. 
+The program begins by including `lexbor/css/css.h`, which is essential as it +provides the functions, types, and structures required for working with the +lexbor CSS parser. ### Callback Function -The `callback` function is defined to handle logging messages that arise during the CSS parsing process. It takes in data and its length, printing the message using `printf`. This function is a straightforward implementation that merely outputs the parsed messages but can be extended for more complex handlers if needed. +The `callback` function is defined to handle logging messages that arise during +the CSS parsing process. It takes in data and its length, printing the message +using `printf`. This function is a straightforward implementation that merely +outputs the parsed messages but can be extended for more complex handlers if +needed. ### Main Function Logic Inside the `main` function, the following key operations are performed: 1. **Memory Setup:** - - A memory object is created using `lxb_css_memory_create`, which acts as a buffer for all parsed structures. - - Initialization of memory is conducted with `lxb_css_memory_init`, setting aside an initial block of 128 bytes. + - A memory object is created using `lxb_css_memory_create`, which acts as a + buffer for all parsed structures. + - Initialization of memory is conducted with `lxb_css_memory_init`, setting + aside an initial block of 128 bytes. 2. **Parser Initialization:** - - A CSS parser object is instantiated with `lxb_css_parser_create` and initialized using `lxb_css_parser_init`. + - A CSS parser object is instantiated with `lxb_css_parser_create` and + initialized using `lxb_css_parser_init`. 3. **Binding the Memory and Selectors:** - - It is crucial to bind the created memory object to the parser to prevent memory allocation issues during parsing. This is achieved using `lxb_css_parser_memory_set`. - - Similarly, a selectors object is created and initialized. 
This object must also be bound to the parser, so its data can be managed correctly while parsing CSS selectors. + - It is crucial to bind the created memory object to the parser to prevent + memory allocation issues during parsing. This is achieved using + `lxb_css_parser_memory_set`. + - Similarly, a selectors object is created and initialized. This object must + also be bound to the parser, so its data can be managed correctly while + parsing CSS selectors. ### Parsing CSS Selectors -The program defines a static array of CSS selectors to be parsed. Each selector is processed in a loop, where: +The program defines a static array of CSS selectors to be parsed. Each selector +is processed in a loop, where: - The parser attempts to parse each selector using `lxb_css_selectors_parse`. -- The output is assessed to determine if parsing was successful or if there were any warnings or errors. Any issues are logged using the `callback` function, which provides informative feedback. +- The output is assessed to determine if parsing was successful or if there were + any warnings or errors. Any issues are logged using the `callback` function, + which provides informative feedback. ### Resource Cleanup -After all parsing operations, the program ensures to destroy the selectors and parser resources, calling `lxb_css_selectors_destroy` and `lxb_css_parser_destroy`. This step is crucial in managing memory and avoiding leaks in longer-running applications. +After all parsing operations, the program destroys the selectors and +parser resources, calling `lxb_css_selectors_destroy` and +`lxb_css_parser_destroy`. This step is crucial in managing memory and avoiding +leaks in longer-running applications. ### Serialization of Results -Finally, the parsed selector lists are serialized and printed. For each selector, the program checks if any parsing results were generated by the `lxb_css_selector_serialize_list` function.
If a selector results in an empty list, it is noted accordingly. +Finally, the parsed selector lists are serialized and printed. For each +selector, the program checks if any parsing results were generated by the +`lxb_css_selector_serialize_list` function. If a selector results in an empty +list, it is noted accordingly. ### Conclusion -The `list_fast_way.c` example serves as a practical guide for developers looking to understand how to parse CSS selectors using the lexbor library. By emphasizing memory management, proper initialization, and error handling, this example lays a solid foundation for further applications of the library in real-world projects. The code harnesses the flexibility of lexbor while maintaining clarity and efficiency in parsing operations, making it an invaluable resource for CSS-related development. \ No newline at end of file +The `list_fast_way.c` example serves as a practical guide for developers looking +to understand how to parse CSS selectors using the lexbor library. By +emphasizing memory management, proper initialization, and error handling, this +example lays a solid foundation for further applications of the library in +real-world projects. The code harnesses the flexibility of lexbor while +maintaining clarity and efficiency in parsing operations, making it an +invaluable resource for CSS-related development. \ No newline at end of file diff --git a/source/examples/css/syntax/simple_colorize.md b/source/examples/css/syntax/simple_colorize.md index 86f30d3..4977966 100644 --- a/source/examples/css/syntax/simple_colorize.md +++ b/source/examples/css/syntax/simple_colorize.md @@ -1,61 +1,93 @@ # CSS Syntax Parsing Example -This article provides an explanation of a code example from the source file [lexbor/css/syntax/simple_colorize.c](https://github.com/lexbor/lexbor/blob/master/examples/lexbor/css/syntax/simple_colorize.c). 
The code implements a simple CSS parser that reads a CSS file, parses its content, and provides color-coded output for each type of CSS rule and declaration using ANSI escape codes. +This article provides an explanation of a code example from the source file +[lexbor/css/syntax/simple_colorize.c](https://github.com/lexbor/lexbor/blob/master/examples/lexbor/css/syntax/simple_colorize.c). +The code implements a simple CSS parser that reads a CSS file, parses its +content, and provides color-coded output for each type of CSS rule and +declaration using ANSI escape codes. ## Structure of the Program -The main function serves as the entry point of the program, where the user is expected to provide a CSS file as an argument. The program then reads this file into memory, initializes a CSS parser, and calls a function to parse the CSS content. +The main function serves as the entry point of the program, where the user is +expected to provide a CSS file as an argument. The program then reads this file +into memory, initializes a CSS parser, and calls a function to parse the CSS +content. ### Key Components 1. **Initialization and File Handling**: - The program checks for the correct number of command-line arguments. - - It leverages the `lexbor_fs_file_easy_read` function to read the CSS content from the specified file into a buffer. + - It leverages the `lexbor_fs_file_easy_read` function to read the CSS + content from the specified file into a buffer. 2. **CSS Parser Setup**: - It creates an instance of a CSS parser using `lxb_css_parser_create`. - The parser is then initialized with `lxb_css_parser_init`. 3. **CSS Parsing Function**: - - The function `css_parse` is called, which sets up the parsing context and starts the rule parsing process. + - The function `css_parse` is called, which sets up the parsing context and + starts the rule parsing process. 4. 
**Token Handling**: - - Several callback functions are defined to handle the various types of CSS syntax tokens, including qualified rules, at-rules, and declaration blocks. + - Several callback functions are defined to handle the various types of CSS + syntax tokens, including qualified rules, at-rules, and declaration blocks. ## Detailed Code Explanation ### CSS Parsing Function (`css_parse`) -The `css_parse` function initializes a context structure `css_ctx_t`, which tracks the current offset within the CSS data while parsing. It sets the parsing buffer using `lxb_css_parser_buffer_set` and begins the rule parsing using `lxb_css_syntax_parser_list_rules_push`. +The `css_parse` function initializes a context structure `css_ctx_t`, which +tracks the current offset within the CSS data while parsing. It sets the parsing +buffer using `lxb_css_parser_buffer_set` and begins the rule parsing using +`lxb_css_syntax_parser_list_rules_push`. -The call to `lxb_css_syntax_parser_run` runs the parser, which processes the CSS tokens based on the rules specified. This function returns a status that indicates whether the parsing succeeded or failed. +The call to `lxb_css_syntax_parser_run` runs the parser, which processes the CSS +tokens based on the rules specified. This function returns a status that +indicates whether the parsing succeeded or failed. ### Token Callbacks -The program defines various inline functions and callbacks to handle the output of tokens during parsing: +The program defines various inline functions and callbacks to handle the output +of tokens during parsing: -- **`css_print_token`** and **`css_print_token_offset`**: These functions print a CSS token along with proper formatting. They utilize ANSI escape codes to change text color in the console output for better visualization. +- **`css_print_token`** and **`css_print_token_offset`**: These functions print + a CSS token along with proper formatting. 
They utilize ANSI escape codes to + change text color in the console output for better visualization. ### Rule Handling The parser is equipped with callbacks for handling different CSS rules: -- **`css_list_rules_state`**: This function handles the state of list rules and is responsible for printing the state with a specific color. +- **`css_list_rules_state`**: This function handles the state of list rules and + is responsible for printing the state with a specific color. -- **`css_at_rule_state`** and **`css_at_rule_block`**: These handle at-rules and their blocks, printing the corresponding tokens and managing the nested structure of CSS. +- **`css_at_rule_state`** and **`css_at_rule_block`**: These handle at-rules and + their blocks, printing the corresponding tokens and managing the nested + structure of CSS. -- **`css_qualified_rule_state`** and **`css_qualified_rule_block`**: Manage the parsing of qualified rules and their associated declaration blocks, printing relevant information while maintaining contextual awareness of the current location within the CSS input. +- **`css_qualified_rule_state`** and **`css_qualified_rule_block`**: Manage the + parsing of qualified rules and their associated declaration blocks, printing + relevant information while maintaining contextual awareness of the current + location within the CSS input. ### Declarations Handling The parsing of declarations involves several parts: -- **`css_declarations_name`** and **`css_declarations_value`**: Handle the CSS property names and values, respectively, printing them in different colors to distinguish visually between different parts of declarations. +- **`css_declarations_name`** and **`css_declarations_value`**: Handle the CSS + property names and values, respectively, printing them in different colors to + distinguish visually between different parts of declarations. 
### Memory Management -The code ensures to clean up the allocated memory for the CSS data buffer and parser instance by calling `lexbor_free` and `lxb_css_parser_destroy`, which prevents memory leaks. +The code cleans up the allocated memory for the CSS data buffer and +parser instance by calling `lexbor_free` and `lxb_css_parser_destroy`, which +prevents memory leaks. ## Conclusion -This example illustrates how to implement a simple CSS parser that reads a file, processes its content into structured tokens, and outputs the result with visual cues. The use of callback functions and context structures allows for flexible and extendable parsing logic, suitable for more complex scenarios in CSS syntax processing. \ No newline at end of file +This example illustrates how to implement a simple CSS parser that reads a file, +processes its content into structured tokens, and outputs the result with visual +cues. The use of callback functions and context structures allows for flexible +and extendable parsing logic, suitable for more complex scenarios in CSS syntax +processing. \ No newline at end of file diff --git a/source/examples/css/syntax/structure_parse_file.md b/source/examples/css/syntax/structure_parse_file.md index 10efbfb..c24d38a 100644 --- a/source/examples/css/syntax/structure_parse_file.md +++ b/source/examples/css/syntax/structure_parse_file.md @@ -1,49 +1,89 @@ # CSS Syntax Parser Example -This article provides an overview of the code located in [lexbor/css/syntax/structure_parse_file.c](https://github.com/lexbor/lexbor/blob/master/examples/lexbor/css/syntax/structure_parse_file.c), which implements a CSS syntax parser using the lexbor library. The primary goal of this code is to parse CSS syntax rules and declarations, handling various states and transitions within the parsing process.
+This article provides an overview of the code located in +[lexbor/css/syntax/structure_parse_file.c](https://github.com/lexbor/lexbor/blob/master/examples/lexbor/css/syntax/structure_parse_file.c), +which implements a CSS syntax parser using the lexbor library. The primary goal +of this code is to parse CSS syntax rules and declarations, handling various +states and transitions within the parsing process. ## Code Overview -The code starts with the inclusion of headers that bring in necessary definitions and functions from the lexbor library. It defines multiple functions and callback structures that manage the parsing of different CSS constructs. Central to the code is the `main` function, which serves as the entry point of the application. +The code starts with the inclusion of headers that bring in necessary +definitions and functions from the lexbor library. It defines multiple functions +and callback structures that manage the parsing of different CSS constructs. +Central to the code is the `main` function, which serves as the entry point of +the application. ### Main Function The `main` function performs several key operations: -1. **Argument Validation**: It checks if the number of command-line arguments is correct. If not, it prints usage instructions and exits the program. +1. **Argument Validation**: It checks if the number of command-line arguments is + correct. If not, it prints usage instructions and exits the program. -2. **File Reading**: It reads a CSS file specified by the user and stores its contents into a variable `css`. If this reading fails, the program exits with an error message. +2. **File Reading**: It reads a CSS file specified by the user and stores its + contents into a variable `css`. If this reading fails, the program exits with + an error message. -3. **Parser Initialization**: It creates and initializes a CSS parser instance. If the initialization fails, the program reports an error and exits. +3. 
**Parser Initialization**: It creates and initializes a CSS parser instance. + If the initialization fails, the program reports an error and exits. -4. **Parsing Execution**: The `css_parse` function is called with the parser and the CSS data to carry out the parsing process. +4. **Parsing Execution**: The `css_parse` function is called with the parser and + the CSS data to carry out the parsing process. -5. **Cleanup**: After the parsing is done, it releases allocated resources and exits with success or failure status based on the parsing outcome. +5. **Cleanup**: After the parsing is done, it releases allocated resources and + exits with success or failure status based on the parsing outcome. ### CSS Parsing Implementation -The `css_parse` function is crucial as it sets up the parsing buffer and pushes the initial parsing rules onto a stack. Here's a breakdown of its functionality: +The `css_parse` function is crucial as it sets up the parsing buffer and pushes +the initial parsing rules onto a stack. Here's a breakdown of its functionality: -- **Set Buffer**: The parsing buffer of the parser is set with the provided CSS data and its length. +- **Set Buffer**: The parsing buffer of the parser is set with the provided CSS + data and its length. -- **Push Rules**: The function uses the `lxb_css_syntax_parser_list_rules_push` to initiate the parsing of list rules, which is a fundamental construct in CSS. It expects a pointer to a set of callback functions that manage how the list of rules is processed. +- **Push Rules**: The function uses the `lxb_css_syntax_parser_list_rules_push` + to initiate the parsing of list rules, which is a fundamental construct in + CSS. It expects a pointer to a set of callback functions that manage how the + list of rules is processed. -- **Run Parser**: Finally, it triggers the parsing process with `lxb_css_syntax_parser_run`, which advances through the tokens available in the CSS data. 
+- **Run Parser**: Finally, it triggers the parsing process with + `lxb_css_syntax_parser_run`, which advances through the tokens available in + the CSS data. ### Callback Functions -The code defines a series of callback functions that manage specific CSS rules, states, and declarations: +The code defines a series of callback functions that manage specific CSS rules, +states, and declarations: -- **State Management**: Functions like `css_list_rules_state`, `css_at_rule_state`, and `css_declarations_name` handle specific parser states. Each of these functions typically logs the current processing step and processes tokens of interest. They return a success status after handling the tokens. +- **State Management**: Functions like `css_list_rules_state`, + `css_at_rule_state`, and `css_declarations_name` handle specific parser + states. Each of these functions typically logs the current processing step and + processes tokens of interest. They return a success status after handling the + tokens. -- **Handling Blocks**: Functions such as `css_at_rule_block` and `css_qualified_rule_block` manage blocks of CSS rules, utilizing the `css_consule_tokens` function to process tokens within those blocks. These functions also handle stack manipulations depending on the rule context, such as pushing or popping a stack. +- **Handling Blocks**: Functions such as `css_at_rule_block` and + `css_qualified_rule_block` manage blocks of CSS rules, utilizing the + `css_consule_tokens` function to process tokens within those blocks. These + functions also handle stack manipulations depending on the rule context, such + as pushing or popping a stack. -- **End States**: Functions like `css_list_rules_end` and `css_declarations_end` signal the completion of various sections. These may log end messages or perform any necessary cleanup. +- **End States**: Functions like `css_list_rules_end` and `css_declarations_end` + signal the completion of various sections. 
These may log end messages or + perform any necessary cleanup. ### Additional Utility Functions -The utility function `css_consule_tokens` is noteworthy. It iterates through tokens and processes each one sequentially, calling `lxb_css_syntax_token_serialize`, which presumably serializes or logs the token data. This function also handles token consumption, facilitating smooth progress through the parsing state. +The utility function `css_consule_tokens` is noteworthy. It iterates through +tokens and processes each one sequentially, calling +`lxb_css_syntax_token_serialize`, which presumably serializes or logs the token +data. This function also handles token consumption, facilitating smooth progress +through the parsing state. ### Conclusion -The code contained in `structure_parse_file.c` offers a comprehensive implementation of a CSS syntax parser with well-defined states and callbacks. The use of systematic error handling and resource management provides stability to the parsing process. By integrating these components, the lexbor library enhances its ability to interpret and manipulate CSS effectively. \ No newline at end of file +The code contained in `structure_parse_file.c` offers a comprehensive +implementation of a CSS syntax parser with well-defined states and callbacks. +The use of systematic error handling and resource management provides stability +to the parsing process. By integrating these components, the lexbor library +enhances its ability to interpret and manipulate CSS effectively. 
\ No newline at end of file diff --git a/source/examples/css/syntax/tokenizer/chunks_stdin.md b/source/examples/css/syntax/tokenizer/chunks_stdin.md index 146fa99..5f8cd80 100644 --- a/source/examples/css/syntax/tokenizer/chunks_stdin.md +++ b/source/examples/css/syntax/tokenizer/chunks_stdin.md @@ -1,20 +1,32 @@ # CSS Syntax Tokenizer Example -This article explains the implementation of a CSS syntax tokenizer in the file [lexbor/css/syntax/tokenizer/chunks_stdin.c](https://github.com/lexbor/lexbor/blob/master/examples/lexbor/css/syntax/tokenizer/chunks_stdin.c). The code demonstrates how to read CSS data from standard input, tokenize it, and output the identified token types along with their serialized representations. +This article explains the implementation of a CSS syntax tokenizer in the file +[lexbor/css/syntax/tokenizer/chunks_stdin.c](https://github.com/lexbor/lexbor/blob/master/examples/lexbor/css/syntax/tokenizer/chunks_stdin.c). +The code demonstrates how to read CSS data from standard input, tokenize it, and +output the identified token types along with their serialized representations. ## Overview -The main purpose of this example is to showcase the mechanics of the `lxb_css_syntax_tokenizer`, a component provided by the Lexbor library for parsing CSS syntax. The example leverages standard input (stdin) to read CSS input, processes the tokens through the tokenizer, and outputs details about each token to the console. +The main purpose of this example is to showcase the mechanics of the +`lxb_css_syntax_tokenizer`, a component provided by the Lexbor library for +parsing CSS syntax. The example leverages standard input (stdin) to read CSS +input, processes the tokens through the tokenizer, and outputs details about +each token to the console. ## Code Breakdown ### Includes and Definitions -At the beginning of the file, necessary headers are included, such as `lexbor/css/css.h`, which contains the definitions and interfaces for the CSS parser. 
A small buffer size of 32 bytes is defined with `#define BUFFER_SIZE 32`, which limits the amount of data read from stdin at one time, making it suitable for demonstration purposes. +At the beginning of the file, necessary headers are included, such as +`lexbor/css/css.h`, which contains the definitions and interfaces for the CSS +parser. A small buffer size of 32 bytes is defined with `#define BUFFER_SIZE +32`, which limits the amount of data read from stdin at one time, making it +suitable for demonstration purposes. ### Callback Function -The `callback` function is defined to handle the serialized output of the tokens: +The `callback` function is defined to handle the serialized output of the +tokens: ```c lxb_status_t callback(const lxb_char_t *data, size_t len, void *ctx) { @@ -23,11 +35,14 @@ lxb_status_t callback(const lxb_char_t *data, size_t len, void *ctx) { } ``` -This function prints the serialized token data to the console and returns a status indicating success. It serves as a simple mechanism to display token information during parsing. +This function prints the serialized token data to the console and returns a +status indicating success. It serves as a simple mechanism to display token +information during parsing. ### Chunk Callback Function -The `chunk_cb` function reads chunks of CSS data into a buffer and sets up the tokenizer to consume these chunks: +The `chunk_cb` function reads chunks of CSS data into a buffer and sets up the +tokenizer to consume these chunks: ```c lxb_status_t chunk_cb(lxb_css_syntax_tokenizer_t *tkz, const lxb_char_t **data, const lxb_char_t **end, void *ctx) { @@ -50,11 +65,16 @@ lxb_status_t chunk_cb(lxb_css_syntax_tokenizer_t *tkz, const lxb_char_t **data, } ``` -The function first attempts to read a buffer full of CSS data from stdin. If the end of input is reached, it marks the tokenizer's end-of-file (EOF) state. If an error occurs during reading, it returns a failure status. 
The function effectively prepares the data for the tokenizer by updating the pointed `data` and `end` pointers. +The function first attempts to read a buffer full of CSS data from stdin. If the +end of input is reached, it marks the tokenizer's end-of-file (EOF) state. If an +error occurs during reading, it returns a failure status. The function +effectively prepares the data for the tokenizer by updating the pointed-to `data` +and `end` pointers. ### Main Function -The `main` function orchestrates the initialization and the execution of the CSS syntax tokenizer: +The `main` function orchestrates the initialization and the execution of the CSS +syntax tokenizer: ```c int main(int argc, const char *argv[]) { @@ -75,7 +95,10 @@ int main(int argc, const char *argv[]) { lxb_css_syntax_tokenizer_chunk_cb_set(tkz, chunk_cb, inbuf); ``` -This section starts by creating and initializing a tokenizer instance. If initialization fails, it gracefully exits the process. Notably, it sets the chunk callback function, associating it with the previously defined `chunk_cb` and the input buffer `inbuf`. +This section starts by creating and initializing a tokenizer instance. If +initialization fails, it gracefully exits the process. Notably, it sets the +chunk callback function, associating it with the previously defined `chunk_cb` +and the input buffer `inbuf`. #### Token Processing Loop @@ -100,18 +123,27 @@ do { } while (type != LXB_CSS_SYNTAX_TOKEN__EOF); ``` -In this loop, it retrieves the next token from the tokenizer and checks for parsing errors. If a token is successfully obtained, it retrieves and prints the token's type name, serializes the token using the earlier defined `callback`, and then consumes the token to prepare for the next cycle. This loop continues until an EOF token is encountered. +In this loop, it retrieves the next token from the tokenizer and checks for +parsing errors.
If a token is successfully obtained, it retrieves and prints the +token's type name, serializes the token using the earlier defined `callback`, +and then consumes the token to prepare for the next cycle. This loop continues +until an EOF token is encountered. ### Cleanup -At the end of the function, the tokenizer is destroyed to free up allocated resources: +At the end of the function, the tokenizer is destroyed to free up allocated +resources: ```c lxb_css_syntax_tokenizer_destroy(tkz); ``` -If any failures occur at various stages, the code ensures proper cleanup to avoid memory leaks. +If any failures occur at various stages, the code ensures proper cleanup to +avoid memory leaks. ## Conclusion -This example illustrates how to implement a simple CSS syntax tokenizer using the Lexbor library, allowing for parsing CSS input from stdin and outputting token information. Anyone looking to understand or extend CSS parsing functionality can use this code as a foundation for further development. \ No newline at end of file +This example illustrates how to implement a simple CSS syntax tokenizer using +the Lexbor library, allowing for parsing CSS input from stdin and outputting +token information. Anyone looking to understand or extend CSS parsing +functionality can use this code as a foundation for further development. \ No newline at end of file diff --git a/source/examples/css/syntax/tokenizer/from_file.md b/source/examples/css/syntax/tokenizer/from_file.md index e33c08c..6140a65 100644 --- a/source/examples/css/syntax/tokenizer/from_file.md +++ b/source/examples/css/syntax/tokenizer/from_file.md @@ -1,10 +1,17 @@ # CSS Syntax Tokenizer Example -This article provides a detailed explanation of a CSS syntax tokenizer implemented in the file [lexbor/css/syntax/tokenizer/from_file.c](https://github.com/lexbor/lexbor/blob/master/examples/lexbor/css/syntax/tokenizer/from_file.c). 
The code serves the purpose of reading a CSS file, processing its contents to extract tokens, and producing output that describes each token. +This article provides a detailed explanation of a CSS syntax tokenizer +implemented in the file +[lexbor/css/syntax/tokenizer/from_file.c](https://github.com/lexbor/lexbor/blob/master/examples/lexbor/css/syntax/tokenizer/from_file.c). +The code serves the purpose of reading a CSS file, processing its contents to +extract tokens, and producing output that describes each token. ## Overview -The main function of the tokenizer is to parse CSS code from a file, generate tokens for syntactic analysis, and then invoke a callback function to handle the output of each token. The program efficiently handles input and organizes the parsing process with the help of the lexbor library. +The main function of the tokenizer is to parse CSS code from a file, generate +tokens for syntactic analysis, and then invoke a callback function to handle the +output of each token. The program efficiently handles input and organizes the +parsing process with the help of the lexbor library. ## Code Breakdown @@ -17,7 +24,9 @@ At the beginning of the file, necessary libraries are included: #include ``` -The first include provides access to CSS-related functionality within the lexbor library, whereas the second includes core file system operations needed to read the CSS file. +The first include provides access to CSS-related functionality within the lexbor +library, whereas the second includes core file system operations needed to read +the CSS file. A utility function `usage` is defined to provide a simple usage instruction: @@ -28,11 +37,13 @@ static void usage(void) } ``` -This function prints an error message when the user does not provide the correct number of arguments. +This function prints an error message when the user does not provide the correct +number of arguments. 
### Main Function Logic -The entry point of the program is the `main` function, which processes command-line arguments and orchestrates the tokenization process: +The entry point of the program is the `main` function, which processes +command-line arguments and orchestrates the tokenization process: ```c int main(int argc, const char *argv[]) @@ -40,7 +51,8 @@ int main(int argc, const char *argv[]) #### Argument Validation -At the start of the main function, the program checks whether exactly one command-line argument (the CSS file name) has been provided: +At the start of the main function, the program checks whether exactly one +command-line argument (the CSS file name) has been provided: ```c if (argc != 2) { @@ -62,7 +74,8 @@ if (css == NULL) { } ``` -The `lexbor_fs_file_easy_read` function reads the entire file into memory, and if it fails, the program reports the error and exits. +The `lexbor_fs_file_easy_read` function reads the entire file into memory, and +if it fails, the program reports the error and exits. #### Tokenizer Initialization @@ -73,11 +86,14 @@ tkz = lxb_css_syntax_tokenizer_create(); status = lxb_css_syntax_tokenizer_init(tkz); ``` -These lines allocate memory for the tokenizer and perform any necessary setup. If initialization fails, an error message is printed, and the program proceeds to cleanup. +These lines allocate memory for the tokenizer and perform any necessary setup. +If initialization fails, an error message is printed, and the program proceeds +to cleanup. #### Setting Input Buffer -Next, the contents of the CSS file are set as the input buffer for the tokenizer: +Next, the contents of the CSS file are set as the input buffer for the +tokenizer: ```c lxb_css_syntax_tokenizer_buffer_set(tkz, css, css_len); @@ -112,9 +128,15 @@ while (type != LXB_CSS_SYNTAX_TOKEN__EOF); #### Token Extraction -Within the loop, the function `lxb_css_syntax_token` retrieves a token. If no token is available, it reports a parsing failure. 
Upon successful token retrieval, it prints the type name of the token followed by calling `lxb_css_syntax_token_serialize`, which uses the provided `callback` function to output the token data. +Within the loop, the function `lxb_css_syntax_token` retrieves a token. If no +token is available, it reports a parsing failure. Upon successful token +retrieval, it prints the type name of the token followed by calling +`lxb_css_syntax_token_serialize`, which uses the provided `callback` function to +output the token data. -The type of the current token is acquired to determine if the end of the file (EOF) has been reached. If the EOF is not reached, the loop continues to consume tokens. +The type of the current token is acquired to determine if the end of the file +(EOF) has been reached. If the EOF is not reached, the loop continues to consume +tokens. ### Cleanup and Exit @@ -125,8 +147,13 @@ lxb_css_syntax_tokenizer_destroy(tkz); lexbor_free(css); ``` -Finally, the program returns `EXIT_SUCCESS` if the execution was successful, or `EXIT_FAILURE` in case of any errors during the process. +Finally, the program returns `EXIT_SUCCESS` if the execution was successful, or +`EXIT_FAILURE` in case of any errors during the process. ## Conclusion -The CSS syntax tokenizer effectively reads and parses a CSS file, extracting and displaying token details by utilizing the lexbor library's API for CSS processing. This example demonstrates not only the functionality of lexer-based parsing but also highlights memory management and error handling within a complex system. \ No newline at end of file +The CSS syntax tokenizer effectively reads and parses a CSS file, extracting and +displaying token details by utilizing the lexbor library's API for CSS +processing. This example demonstrates not only the functionality of lexer-based +parsing but also highlights memory management and error handling within a +complex system. 
\ No newline at end of file diff --git a/source/examples/css/syntax/tokenizer/print_raw.md b/source/examples/css/syntax/tokenizer/print_raw.md index 22c15df..f0e9799 100644 --- a/source/examples/css/syntax/tokenizer/print_raw.md +++ b/source/examples/css/syntax/tokenizer/print_raw.md @@ -1,12 +1,17 @@ # CSS Syntax Tokenizer Example -This article provides an overview of the `print_raw.c` source file, which implements a simple command-line tool for tokenizing CSS syntax using the Lexbor library. The primary purpose of this code is to read a CSS file, tokenize its contents, and print the tokens to the standard output. +This article provides an overview of the `print_raw.c` source file, which +implements a simple command-line tool for tokenizing CSS syntax using the Lexbor +library. The primary purpose of this code is to read a CSS file, tokenize its +contents, and print the tokens to the standard output. ## Breakdown of Major Code Sections ### Usage Function -The `usage` function is defined to inform users about how to execute the program properly. It outputs a simple message stating that the tool requires one argument, which is the name of the file to process: +The `usage` function is defined to inform users about how to execute the program +properly. It outputs a simple message stating that the tool requires one +argument, which is the name of the file to process: ```c static void @@ -16,11 +21,13 @@ usage(void) } ``` -This function is called when the number of command line arguments (`argv`) provided is incorrect. It helps to guide users in using the tool correctly. +This function is called when the number of command line arguments (`argc`) +provided is incorrect. It helps to guide users in using the tool correctly. ### Main Function Logic -The `main` function serves as the entry point of the program.
It starts by +checking if the user has provided exactly one argument: ```c if (argc != 2) { @@ -29,11 +36,14 @@ if (argc != 2) { } ``` -If this condition is not met, the `usage` function is invoked to display the correct usage. The `FAILED` macro indicates an error state, although its definition is not shown in this excerpt. +If this condition is not met, the `usage` function is invoked to display the +correct usage. The `FAILED` macro indicates an error state, although its +definition is not shown in this excerpt. ### Reading the CSS File -The next step involves reading the CSS file specified by the user. The function `lexbor_fs_file_easy_read` attempts to read the file into memory: +The next step involves reading the CSS file specified by the user. The function +`lexbor_fs_file_easy_read` attempts to read the file into memory: ```c css = lexbor_fs_file_easy_read((const lxb_char_t *) argv[1], &css_len); @@ -42,7 +52,8 @@ if (css == NULL) { } ``` -If the reading process fails, the program terminates by invoking the `FAILED` macro once again to report the issue. +If the reading process fails, the program terminates by invoking the `FAILED` +macro once again to report the issue. ### Tokenization Process @@ -53,9 +64,12 @@ tkz = lxb_css_syntax_tokenizer_create(); status = lxb_css_syntax_tokenizer_init(tkz); ``` -After creating the tokenizer, it is initialized with the `lxb_css_syntax_tokenizer_init` function. If the initialization does not succeed, an error message is printed, and the program enters the cleanup phase. +After creating the tokenizer, it is initialized with the +`lxb_css_syntax_tokenizer_init` function. If the initialization does not +succeed, an error message is printed, and the program enters the cleanup phase. 
-The following block of code sets the tokenizer's buffer to contain the CSS content read from the file: +The following block of code sets the tokenizer's buffer to contain the CSS +content read from the file: ```c tkz->with_comment = true; @@ -63,11 +77,13 @@ tkz->with_comment = true; lxb_css_syntax_tokenizer_buffer_set(tkz, css, css_len); ``` -The `with_comment` flag indicates whether comments should be included in the tokenization process. +The `with_comment` flag indicates whether comments should be included in the +tokenization process. ### Processing Tokens -The main loop of the `main` function processes the tokens generated by the tokenizer: +The main loop of the `main` function processes the tokens generated by the +tokenizer: ```c do { @@ -86,19 +102,29 @@ do { while (type != LXB_CSS_SYNTAX_TOKEN__EOF); ``` -Within this loop, a token is fetched, and if it cannot be retrieved, an error message is printed. The `colorize_cb` function is called to handle the output for each token. After processing the token, its type is checked, and it is consumed for the next iteration. +Within this loop, a token is fetched, and if it cannot be retrieved, an error +message is printed. The `colorize_cb` function is called to handle the output +for each token. After processing the token, its type is checked, and it is +consumed for the next iteration. ### Cleanup Phase -After all tokens have been processed, the program cleans up by destroying the tokenizer instance and freeing any allocated memory: +After all tokens have been processed, the program cleans up by destroying the +tokenizer instance and freeing any allocated memory: ```c lxb_css_syntax_tokenizer_destroy(tkz); lexbor_free(css); ``` -Finally, if no errors occurred during processing, the program returns `EXIT_SUCCESS`. In case of failure, it follows a similar cleanup procedure but returns `EXIT_FAILURE`. +Finally, if no errors occurred during processing, the program returns +`EXIT_SUCCESS`. 
In case of failure, it follows a similar cleanup procedure but +returns `EXIT_FAILURE`. ## Conclusion -The `print_raw.c` implementation demonstrates how to leverage the Lexbor library for CSS syntax tokenization. By following a structured approach, it effectively reads CSS content, processes it into tokens, and provides robust error handling. This example serves as a foundation for further exploration of CSS parsing and analysis using Lexbor. \ No newline at end of file +The `print_raw.c` implementation demonstrates how to leverage the Lexbor library +for CSS syntax tokenization. By following a structured approach, it effectively +reads CSS content, processes it into tokens, and provides robust error handling. +This example serves as a foundation for further exploration of CSS parsing and +analysis using Lexbor. \ No newline at end of file diff --git a/source/examples/encoding/buffer/decode/decode.md b/source/examples/encoding/buffer/decode/decode.md index 792ac9f..286b0ea 100644 --- a/source/examples/encoding/buffer/decode/decode.md +++ b/source/examples/encoding/buffer/decode/decode.md @@ -1,45 +1,76 @@ # UTF-8 Decoding Example -In this article, we will explore a code example from the file [lexbor/encoding/buffer/decode/decode.c](https://github.com/lexbor/lexbor/blob/master/examples/lexbor/encoding/buffer/decode/decode.c) that demonstrates how to decode a UTF-8 encoded string into code points using the Lexbor library. This example specifically highlights the usage of Lexbor's encoding functionalities, providing insights into how to leverage these features for character decoding in C. +In this article, we will explore a code example from the file +[lexbor/encoding/buffer/decode/decode.c](https://github.com/lexbor/lexbor/blob/master/examples/lexbor/encoding/buffer/decode/decode.c) +that demonstrates how to decode a UTF-8 encoded string into code points using +the Lexbor library. 
This example specifically highlights the usage of Lexbor's +encoding functionalities, providing insights into how to leverage these features +for character decoding in C. ## Code Explanation -The code begins by including the necessary header files. It specifically includes `lexbor/encoding/encoding.h`, which contains the declarations needed for encoding and decoding operations. The definition of the `FAILED` macro is also provided, which facilitates error handling by printing an error message to `stderr` and terminating the program if an error occurs. +The code begins by including the necessary header files. It specifically +includes `lexbor/encoding/encoding.h`, which contains the declarations needed +for encoding and decoding operations. The definition of the `FAILED` macro is +also provided, which facilitates error handling by printing an error message to +`stderr` and terminating the program if an error occurs. ### Main Function -The `main` function serves as the entry point of our program, where we will set up the decoding of a UTF-8 encoded string. +The `main` function serves as the entry point of our program, where we will set +up the decoding of a UTF-8 encoded string. #### Variable Declarations Within the `main` function, several important variables are declared: - `buf_length`: To store the length of the decoded buffer. -- `status`: To hold the status of operations, indicated by the `lxb_status_t` type. +- `status`: To hold the status of operations, indicated by the `lxb_status_t` + type. - `cp`: An array of `lxb_codepoint_t` to hold the decoded code points. -- `decode`: An instance of `lxb_encoding_decode_t`, which manages the decoding process. +- `decode`: An instance of `lxb_encoding_decode_t`, which manages the decoding + process. - `encoding`: A pointer to the encoding data. -Next, we prepare the buffer that contains the UTF-8 string "Привет, мир!" (which translates to "Hello, World!").
The buffer is defined as `data`, and `end` is set to point to the end of the string using `strlen`. +Next, we prepare the buffer that contains the UTF-8 string "Привет, мир!" (which +translates to "Hello, World!"). The buffer is defined as `data`, and `end` is +set to point to the end of the string using `strlen`. #### Initialization -The initialization process is crucial for setting up the decoder. We call `lxb_encoding_data(LXB_ENCODING_UTF_8)` to get the encoding data for UTF-8. Then, we initialize the decoder using `lxb_encoding_decode_init`, passing the decoder instance, encoding, the code point array, and its capacity. +The initialization process is crucial for setting up the decoder. We call +`lxb_encoding_data(LXB_ENCODING_UTF_8)` to get the encoding data for UTF-8. +Then, we initialize the decoder using `lxb_encoding_decode_init`, passing the +decoder instance, encoding, the code point array, and its capacity. -If this initialization fails, the `FAILED` macro is triggered, notifying us with an error message and stopping the program. +If this initialization fails, the `FAILED` macro is triggered, notifying us with +an error message and stopping the program. #### Decoding Process -After successful initialization, we print the original UTF-8 string to the console. The actual decoding is carried out by calling the `decode` function through the `encoding` pointer. The function decodes the string pointed to by `data` up to its `end`, storing the results in the `cp` array. +After successful initialization, we print the original UTF-8 string to the +console. The actual decoding is carried out by calling the `decode` function +through the `encoding` pointer. The function decodes the string pointed to by +`data` up to its `end`, storing the results in the `cp` array. -In this context, an error during decoding is not expected.
Therefore, the code contains a comment indicating that such a situation cannot occur in this example, underlining the robustness of the decoding function for the given input. +In this context, an error during decoding is not expected. Therefore, the code +contains a comment indicating that such a situation cannot occur in this +example, underlining the robustness of the decoding function for the given +input. #### Output and Conclusion -Finally, we calculate the length of the buffer used in the decoding process with `lxb_encoding_decode_buf_used(&decode)` and print each decoded code point in hexadecimal format. +Finally, we calculate the length of the buffer used in the decoding process with +`lxb_encoding_decode_buf_used(&decode)` and print each decoded code point in +hexadecimal format. The program concludes with a return statement indicating successful execution. ## Summary -This example effectively illustrates how to decode a UTF-8 string into individual code points using the Lexbor library. It emphasizes the initialization of the decoding context, error handling strategies, and the process of translating encoded UTF-8 data into usable character representations. Through careful management of buffers and decoding functions, developers can build robust applications that accurately handle multi-byte character sets. \ No newline at end of file +This example effectively illustrates how to decode a UTF-8 string into +individual code points using the Lexbor library. It emphasizes the +initialization of the decoding context, error handling strategies, and the +process of translating encoded UTF-8 data into usable character representations. +Through careful management of buffers and decoding functions, developers can +build robust applications that accurately handle multi-byte character sets. 
\ No newline at end of file diff --git a/source/examples/encoding/buffer/decode/decoder.md b/source/examples/encoding/buffer/decode/decoder.md index 4463576..9ae8765 100644 --- a/source/examples/encoding/buffer/decode/decoder.md +++ b/source/examples/encoding/buffer/decode/decoder.md @@ -1,47 +1,84 @@ # Unicode Decoder Example -In this article, we will discuss a simple Unicode decoder implemented in C, specifically within the context of the lexbor library. The code can be found in the source file [lexbor/encoding/buffer/decode/decoder.c](https://github.com/lexbor/lexbor/blob/master/examples/lexbor/encoding/buffer/decode/decoder.c). This program is designed to take a specified character encoding from the command line, read input data, and decode it into Unicode code points, displaying the result in a format suitable for further processing or representation. +In this article, we will discuss a simple Unicode decoder implemented in C, +specifically within the context of the lexbor library. The code can be found in +the source file +[lexbor/encoding/buffer/decode/decoder.c](https://github.com/lexbor/lexbor/blob/master/examples/lexbor/encoding/buffer/decode/decoder.c). +This program is designed to take a specified character encoding from the command +line, read input data, and decode it into Unicode code points, displaying the +result in a format suitable for further processing or representation. ## Code Structure Overview -The code begins with the necessary includes, defines, and utility functions required for the decoder's operation. Key components include error handling, usage instructions, and the main decoding loop. +The code begins with the necessary includes, defines, and utility functions +required for the decoder's operation. Key components include error handling, +usage instructions, and the main decoding loop. ### Error Handling Macro -The `FAILED` macro is defined to streamline error reporting throughout the code. 
It takes a boolean indicating if usage should be displayed, followed by a formatted message. If an error occurs, this macro outputs the error message to standard error and, if requested, invokes the `usage()` function to display acceptable encoding options. +The `FAILED` macro is defined to streamline error reporting throughout the code. +It takes a boolean indicating if usage should be displayed, followed by a +formatted message. If an error occurs, this macro outputs the error message to +standard error and, if requested, invokes the `usage()` function to display +acceptable encoding options. ### Usage Function -The `usage` function is a simple utility that displays how the program should be invoked and lists the character encodings that the decoder supports. This function becomes crucial when the user fails to provide the expected arguments. +The `usage` function is a simple utility that displays how the program should be +invoked and lists the character encodings that the decoder supports. This +function becomes crucial when the user fails to provide the expected arguments. ### Main Function Logic -The `main` function serves as the entry point of the application. It handles argument parsing, encoding determination, and the initialization of the decoding process. +The `main` function serves as the entry point of the application. It handles +argument parsing, encoding determination, and the initialization of the decoding +process. #### Argument Parsing -The program checks if exactly one argument (the encoding name) has been provided. If not, it calls the `usage()` function and exits gracefully. +The program checks if exactly one argument (the encoding name) has been +provided. If not, it calls the `usage()` function and exits gracefully. #### Encoding Retrieval -Next, it uses the `lxb_encoding_data_by_pre_name` function to retrieve the encoding data based on the provided encoding name. 
If the encoding cannot be determined, the `FAILED` macro is invoked with appropriate error handling. +Next, it uses the `lxb_encoding_data_by_pre_name` function to retrieve the +encoding data based on the provided encoding name. If the encoding cannot be +determined, the `FAILED` macro is invoked with appropriate error handling. #### Decoder Initialization -Once the encoding is acquired, the decoder is initialized using `lxb_encoding_decode_init`. It also sets up a buffer for any replacement characters that may need to be utilized during the decoding process. Each initialization step includes error checking to ensure the decoder is prepared for processing the input data. +Once the encoding is acquired, the decoder is initialized using +`lxb_encoding_decode_init`. It also sets up a buffer for any replacement +characters that may need to be utilized during the decoding process. Each +initialization step includes error checking to ensure the decoder is prepared +for processing the input data. ### Decoding Loop -The main decoding operation occurs within a loop that reads data from standard input. The program continuously reads chunks of data into a buffer (`inbuf`) until the end of the input is reached. +The main decoding operation occurs within a loop that reads data from standard +input. The program continuously reads chunks of data into a buffer (`inbuf`) +until the end of the input is reached. #### Buffer Processing -For each chunk of data read, the program decodes the input using the encoding's decode function. It iterates over the decoded results, determining whether each code point is an ASCII character or a Unicode character. The output format uses a hexadecimal representation for both types of characters, with Unicode points prefixed by `\u` and ASCII points by `\x`. +For each chunk of data read, the program decodes the input using the encoding's +decode function. 
It iterates over the decoded results, determining whether each +code point is an ASCII character or a Unicode character. The output format uses +a hexadecimal representation for both types of characters, with Unicode points +prefixed by `\u` and ASCII points by `\x`. #### Finalizing Decoding -After all input data has been processed, the decoder's `finish` function is called. This function ensures that any remaining code points, particularly those that could not be fully processed, are correctly handled. The remaining code points are then printed if any exist in the output buffer. +After all input data has been processed, the decoder's `finish` function is +called. This function ensures that any remaining code points, particularly those +that could not be fully processed, are correctly handled. The remaining code +points are then printed if any exist in the output buffer. ## Conclusion -This `decoder.c` example illustrates the practical use of the lexbor library for handling various character encodings and converting them into a clear, usable form. By leveraging the available utility functions and error handling methods, the code provides a robust framework for decoding inputs in a specified encoding, making it valuable for any application that requires processing text in diverse formats. \ No newline at end of file +This `decoder.c` example illustrates the practical use of the lexbor library for +handling various character encodings and converting them into a clear, usable +form. By leveraging the available utility functions and error handling methods, +the code provides a robust framework for decoding inputs in a specified +encoding, making it valuable for any application that requires processing text +in diverse formats. 
\ No newline at end of file diff --git a/source/examples/encoding/buffer/decode/validate.md b/source/examples/encoding/buffer/decode/validate.md index 74f07f8..7c46088 100644 --- a/source/examples/encoding/buffer/decode/validate.md +++ b/source/examples/encoding/buffer/decode/validate.md @@ -1,22 +1,29 @@ # UTF-8 Decoding and Replacement Example -This article will explain a C code example that demonstrates UTF-8 decoding and the handling of invalid byte sequences using the lexbor library. The source file for the example is [lexbor/encoding/buffer/decode/validate.c](https://github.com/lexbor/lexbor/blob/master/examples/lexbor/encoding/buffer/decode/validate.c). +This article will explain a C code example that demonstrates UTF-8 decoding and +the handling of invalid byte sequences using the lexbor library. The source file +for the example is +[lexbor/encoding/buffer/decode/validate.c](https://github.com/lexbor/lexbor/blob/master/examples/lexbor/encoding/buffer/decode/validate.c). ## Overview -The provided code illustrates how to initialize a decoder for UTF-8 encoded strings and replace any invalid byte sequences with specified replacement code points. This is accomplished utilizing the lexbor encoding API. +The provided code illustrates how to initialize a decoder for UTF-8 encoded +strings and replace any invalid byte sequences with specified replacement code +points. This is accomplished utilizing the lexbor encoding API. ## Code Breakdown ### Including Necessary Headers -At the start of the code, the relevant header file from the lexbor library is included: +At the start of the code, the relevant header file from the lexbor library is +included: ```c #include ``` -This inclusion is necessary as it provides the required declarations and definitions for encoding operations performed later in the code. +This inclusion is necessary as it provides the required declarations and +definitions for encoding operations performed later in the code. 
### Defining a Macro for Error Handling @@ -33,11 +40,14 @@ A macro named `FAILED` is defined to handle errors gracefully: while (0) ``` -This macro uses `fprintf` to print error messages to standard error and then exits the program with `EXIT_FAILURE`. It helps streamline error reporting throughout the code. +This macro uses `fprintf` to print error messages to standard error and then +exits the program with `EXIT_FAILURE`. It helps streamline error reporting +throughout the code. ### Main Function and Buffer Preparation -The main function initializes several variables, including a buffer for decoded code points and an instance of the decoder: +The main function initializes several variables, including a buffer for decoded +code points and an instance of the decoder: ```c int main(int argc, const char *argv[]) { @@ -51,7 +61,9 @@ int main(int argc, const char *argv[]) { const lxb_char_t *end = data + strlen((char *) data); ``` -In this segment, a buffer `cp` is defined to hold up to 32 decoded code points. The `data` variable contains a UTF-8 string that includes an invalid byte (`\x80`). The `end` variable calculates the pointer to the end of the `data`. +In this segment, a buffer `cp` is defined to hold up to 32 decoded code points. +The `data` variable contains a UTF-8 string that includes an invalid byte +(`\x80`). The `end` variable calculates the pointer to the end of the `data`. ### Initializing the Decoder @@ -66,7 +78,10 @@ if (status != LXB_STATUS_OK) { } ``` -Here, `lxb_encoding_data` retrieves the encoding data for UTF-8. The `lxb_encoding_decode_init` function sets up the decoder with the encoding information and the previously defined buffer for decoded code points. If initialization fails, the `FAILED` macro is invoked. +Here, `lxb_encoding_data` retrieves the encoding data for UTF-8. The +`lxb_encoding_decode_init` function sets up the decoder with the encoding +information and the previously defined buffer for decoded code points. 
If +initialization fails, the `FAILED` macro is invoked. ### Configuring Replacement Settings @@ -80,7 +95,9 @@ if (status != LXB_STATUS_OK) { } ``` -This step allows the decoder to specify how to handle invalid sequences by using the replacement character defined in lexbor. Again, the error handling is consistent throughout. +This step allows the decoder to specify how to handle invalid sequences by using +the replacement character defined in lexbor. Again, the error handling is +consistent throughout. ### Decoding the Input String @@ -93,7 +110,9 @@ if (status != LXB_STATUS_OK) { } ``` -This line invokes the decoding process, moving through the input string from `data` to `end`. The decoder attempts to handle any valid sequences and replaces any invalid sequences as configured earlier. +This line invokes the decoding process, moving through the input string from +`data` to `end`. The decoder attempts to handle any valid sequences and replaces +any invalid sequences as configured earlier. ### Outputting the Decoded Values @@ -107,8 +126,13 @@ for (size_t i = 0; i < buf_length; i++) { } ``` -Here, `lxb_encoding_decode_buf_used` retrieves the number of valid code points decoded. Then, a loop iterates over each code point in the buffer, printing the hexadecimal representation. +Here, `lxb_encoding_decode_buf_used` retrieves the number of valid code points +decoded. Then, a loop iterates over each code point in the buffer, printing the +hexadecimal representation. ## Conclusion -This example effectively showcases the use of the lexbor library for decoding UTF-8 strings while managing potentially invalid byte sequences. By initializing the decoder, setting up replacement strategies, and decoding the input string, the program demonstrates a robust method for handling encoding issues in C. \ No newline at end of file +This example effectively showcases the use of the lexbor library for decoding +UTF-8 strings while managing potentially invalid byte sequences. 
By initializing +the decoder, setting up replacement strategies, and decoding the input string, +the program demonstrates a robust method for handling encoding issues in C. \ No newline at end of file diff --git a/source/examples/encoding/buffer/encode/encode.md b/source/examples/encoding/buffer/encode/encode.md index fe02f7c..cbf9bbc 100644 --- a/source/examples/encoding/buffer/encode/encode.md +++ b/source/examples/encoding/buffer/encode/encode.md @@ -1,16 +1,24 @@ # Encoding Unicode Code Points to UTF-8 Example -This article explains the encoding of Unicode code points to a UTF-8 byte string using the Lexbor library. The source code is located in [lexbor/encoding/buffer/encode/encode.c](https://github.com/lexbor/lexbor/blob/master/examples/lexbor/encoding/buffer/encode/encode.c). This example demonstrates how to initialize the encoder, encode Unicode code points, and handle the output appropriately. +This article explains the encoding of Unicode code points to a UTF-8 byte string +using the Lexbor library. The source code is located in +[lexbor/encoding/buffer/encode/encode.c](https://github.com/lexbor/lexbor/blob/master/examples/lexbor/encoding/buffer/encode/encode.c). +This example demonstrates how to initialize the encoder, encode Unicode code +points, and handle the output appropriately. ## Overview -The primary purpose of this code is to convert an array of Unicode code points into a UTF-8 encoded string. The code includes error handling, memory allocation for the output buffer, and final output printing. +The primary purpose of this code is to convert an array of Unicode code points +into a UTF-8 encoded string. The code includes error handling, memory allocation +for the output buffer, and final output printing. ## Code Explanation ### Includes and Macros -The code begins with the inclusion of the `lexbor/encoding/encoding.h` header file, which provides necessary functions and definitions for encoding operations. 
A macro called `FAILED` is defined to handle error reporting: +The code begins with the inclusion of the `lexbor/encoding/encoding.h` header +file, which provides necessary functions and definitions for encoding +operations. A macro called `FAILED` is defined to handle error reporting: ```c #define FAILED(...) \ @@ -23,11 +31,13 @@ The code begins with the inclusion of the `lexbor/encoding/encoding.h` header fi while (0) ``` -This macro simplifies the error handling by printing an error message to `stderr` and exiting the program if there is a failure during initialization. +This macro simplifies the error handling by printing an error message to +`stderr` and exiting the program if there is a failure during initialization. ### Main Function -The `main` function initializes several variables and prepares to encode the Unicode code points: +The `main` function initializes several variables and prepares to encode the +Unicode code points: ```c int main(int argc, const char *argv[]) @@ -41,11 +51,14 @@ int main(int argc, const char *argv[]) lxb_char_t buffer[1024]; ``` -In this section, a buffer of 1024 characters is created to hold the encoded byte string. The `lxb_codepoint_t` array contains several predefined Unicode code points. +In this section, a buffer of 1024 characters is created to hold the encoded byte +string. The `lxb_codepoint_t` array contains several predefined Unicode code +points. ### Unicode Code Points -The code points initialized in the `cps` array represent Cyrillic characters and symbols: +The code points initialized in the `cps` array represent Cyrillic characters and +symbols: ```c lxb_codepoint_t cps[] = {0x041F, 0x0440, 0x0438, 0x0432, 0x0435, 0x0442, @@ -64,7 +77,9 @@ if (status != LXB_STATUS_OK) { } ``` -Here, `lxb_encoding_data` retrieves encoding information for UTF-8, and `lxb_encoding_encode_init` initializes the encoding context. If the initialization fails, the `FAILED` macro is invoked. 
+Here, `lxb_encoding_data` retrieves encoding information for UTF-8, and +`lxb_encoding_encode_init` initializes the encoding context. If the +initialization fails, the `FAILED` macro is invoked. ### Encoding Process @@ -77,7 +92,8 @@ if (status != LXB_STATUS_OK) { } ``` -This line calls the `encode` function from the `encoding` structure, which encodes the code points from `cps_ref` to `cps_end`. +This line calls the `encode` function from the `encoding` structure, which +encodes the code points from `cps_ref` to `cps_end`. ### Output Preparation @@ -95,8 +111,12 @@ Finally, the result is displayed: printf("\nResult: %s\n", (char *) buffer); ``` -This prints the encoded UTF-8 string to standard output along with the original Unicode values shown in hexadecimal format. +This prints the encoded UTF-8 string to standard output along with the original +Unicode values shown in hexadecimal format. ## Conclusion -This code example effectively demonstrates the usage of the Lexbor encoding library for converting Unicode code points to a UTF-8 encoded string. It emphasizes proper initialization, error handling, and output formatting, which are essential for working with character encoding in C programming. \ No newline at end of file +This code example effectively demonstrates the usage of the Lexbor encoding +library for converting Unicode code points to a UTF-8 encoded string. It +emphasizes proper initialization, error handling, and output formatting, which +are essential for working with character encoding in C programming. \ No newline at end of file diff --git a/source/examples/encoding/buffer/encode/encoder.md b/source/examples/encoding/buffer/encode/encoder.md index a0db86d..98c42c1 100644 --- a/source/examples/encoding/buffer/encode/encoder.md +++ b/source/examples/encoding/buffer/encode/encoder.md @@ -1,12 +1,18 @@ # Encoder Example -This article provides an explanation of the `encoder.c` source file located in the `lexbor/encoding/buffer/encode` directory. 
The intent of the code is to implement a command-line utility that encodes input data based on the specified character encoding name. The encoder processes Standard Input, converts it based on escape sequences into code points, and outputs the encoded data to Standard Output. +This article provides an explanation of the `encoder.c` source file located in +the `lexbor/encoding/buffer/encode` directory. The intent of the code is to +implement a command-line utility that encodes input data based on the specified +character encoding name. The encoder processes Standard Input, converts it based +on escape sequences into code points, and outputs the encoded data to Standard +Output. ## Code Structure and Major Sections ### Header and Includes -At the beginning of the file, there are several include statements that bring in necessary libraries: +At the beginning of the file, there are several include statements that bring in +necessary libraries: ```c #include @@ -15,11 +21,14 @@ At the beginning of the file, there are several include statements that bring in #include ``` -These headers allow access to string manipulation functions, standard input/output functionalities, and the defined encoding structures and functions within the `lexbor` library. +These headers allow access to string manipulation functions, standard +input/output functionalities, and the defined encoding structures and functions +within the `lexbor` library. ### Error Handling -The `FAILED` macro is defined to streamline error handling within the code. It prints an error message and usage instructions when an issue occurs: +The `FAILED` macro is defined to streamline error handling within the code. It +prints an error message and usage instructions when an issue occurs: ```c #define FAILED(with_usage, ...) \ @@ -36,11 +45,15 @@ The `FAILED` macro is defined to streamline error handling within the code. 
It p while (0) ``` -This macro takes a boolean flag to determine if usage instructions should be displayed before exiting. This ensures that any critical failures can inform users about incorrect command usage. +This macro takes a boolean flag to determine if usage instructions should be +displayed before exiting. This ensures that any critical failures can inform +users about incorrect command usage. ### Usage Function -The `usage` function provides a simple guide on how to run the encoder, listing available encodings. It helps users understand the valid options to include when calling the program: +The `usage` function provides a simple guide on how to run the encoder, listing +available encodings. It helps users understand the valid options to include when +calling the program: ```c static void usage(void) @@ -53,11 +66,14 @@ static void usage(void) ### Main Function -The `main` function is the core of the program, where execution begins. It handles command-line arguments, initializes encoding setups, reads from Standard Input, and writes the encoded data to Standard Output. +The `main` function is the core of the program, where execution begins. It +handles command-line arguments, initializes encoding setups, reads from Standard +Input, and writes the encoded data to Standard Output. #### Command-Line Argument Handling -The program expects one argument - the encoding name. If this is not provided, the `usage` function is invoked: +The program expects one argument - the encoding name. If this is not provided, +the `usage` function is invoked: ```c if (argc != 2) { @@ -68,7 +84,9 @@ if (argc != 2) { #### Encoding Initialization -The encoding is determined using the `lxb_encoding_data_by_pre_name` function, which fetches the encoding data associated with the provided name. If it fails, it reports an error: +The encoding is determined using the `lxb_encoding_data_by_pre_name` function, +which fetches the encoding data associated with the provided name. 
If it fails, +it reports an error: ```c encoding = lxb_encoding_data_by_pre_name((const lxb_char_t *) argv[1], strlen(argv[1])); @@ -77,7 +95,8 @@ if (encoding == NULL) { } ``` -After determining the encoding, the encoder is initialized with `lxb_encoding_encode_init`: +After determining the encoding, the encoder is initialized with +`lxb_encoding_encode_init`: ```c status = lxb_encoding_encode_init(&encode, encoding, outbuf, sizeof(outbuf)); @@ -90,7 +109,8 @@ This sets up a buffer for output based on the specified encoding type. ### Data Encoding Loop -The heart of the encoding process is found in a `do-while` loop that reads from stdin and encodes the input data: +The heart of the encoding process is found in a `do-while` loop that reads from +stdin and encodes the input data: ```c do { @@ -99,11 +119,15 @@ do { } while (loop); ``` -If the end of the file is reached on standard input (`feof(stdin)`), the loop breaks, indicating that no more data is available. +If the end of the file is reached on standard input (`feof(stdin)`), the loop +breaks, indicating that no more data is available. #### Escaped Code Points Conversion -The `escaped_to_codepoint` function handles the conversion of escape sequences (e.g., '\x41' for 'A') into code points that can be processed. The logic checks for valid escape sequences and builds the code points accordingly. If a broken sequence is detected, it triggers an error: +The `escaped_to_codepoint` function handles the conversion of escape sequences +(e.g., '\x41' for 'A') into code points that can be processed. The logic checks +for valid escape sequences and builds the code points accordingly. If a broken +sequence is detected, it triggers an error: ```c static const lxb_codepoint_t * escaped_to_codepoint(const lxb_char_t *data, ... @@ -115,7 +139,8 @@ if (*state != 0) { ### Finalizing and Outputting -After encoding, the program finalizes the encoded output and writes any remaining data to stdout. 
This is done using: +After encoding, the program finalizes the encoded output and writes any +remaining data to stdout. This is done using: ```c read_size = lxb_encoding_encode_buf_used(&encode); @@ -126,8 +151,14 @@ if (read_size != 0) { } ``` -This ensures that any data that has not yet been flushed from the buffer is written out before the program exits. +This ensures that any data that has not yet been flushed from the buffer is +written out before the program exits. ## Conclusion -The `encoder.c` file is a functional implementation of an encoding utility using the lexbor library. It effectively handles various character encodings, processes input data in a loop, and provides useful output, making it a useful tool for developers working with different text encodings. The awareness of error handling and usage guidance further enhances its usability in command-line environments. \ No newline at end of file +The `encoder.c` file is a functional implementation of an encoding utility using +the lexbor library. It effectively handles various character encodings, +processes input data in a loop, and provides useful output, making it a useful +tool for developers working with different text encodings. The awareness of +error handling and usage guidance further enhances its usability in command-line +environments. \ No newline at end of file diff --git a/source/examples/encoding/buffer/encode/validate.md b/source/examples/encoding/buffer/encode/validate.md index 5d497c8..66cbb36 100644 --- a/source/examples/encoding/buffer/encode/validate.md +++ b/source/examples/encoding/buffer/encode/validate.md @@ -1,43 +1,82 @@ # Unicode Encoding Example -This article explains the functionality of a Unicode encoding example, which can be found in the source file [lexbor/encoding/buffer/encode/validate.c](https://github.com/lexbor/lexbor/blob/master/examples/lexbor/encoding/buffer/encode/validate.c). 
The code serves as an illustration of how to encode Unicode code points into a UTF-8 byte string using the Lexbor library. +This article explains the functionality of a Unicode encoding example, which can +be found in the source file +[lexbor/encoding/buffer/encode/validate.c](https://github.com/lexbor/lexbor/blob/master/examples/lexbor/encoding/buffer/encode/validate.c). +The code serves as an illustration of how to encode Unicode code points into a +UTF-8 byte string using the Lexbor library. ## Overview -The example demonstrates the process of setting up an encoder, preparing a buffer for the encoded result, and ultimately encoding a series of Unicode code points. The code also highlights error handling when initializing the encoder and configuring it with replacement bytes for invalid code points. +The example demonstrates the process of setting up an encoder, preparing a +buffer for the encoded result, and ultimately encoding a series of Unicode code +points. The code also highlights error handling when initializing the encoder +and configuring it with replacement bytes for invalid code points. ## Code Explanation ### Includes and Macros -The code begins by including necessary header files, specifically `string.h` for string manipulation and `lexbor/encoding/encoding.h` for encoding functions from the Lexbor library. A macro named `FAILED` is defined for error handling, which simplifies reporting errors by outputting a message to `stderr` and exiting the program with a failure status. +The code begins by including necessary header files, specifically `string.h` for +string manipulation and `lexbor/encoding/encoding.h` for encoding functions from +the Lexbor library. A macro named `FAILED` is defined for error handling, which +simplifies reporting errors by outputting a message to `stderr` and exiting the +program with a failure status. ### Main Function -The `main` function encapsulates the entire encoding process. 
It starts by declaring variables that will be used later, including an `lxb_encoding_encode_t` structure to handle the encoding state, pointers to a list of code points, and a buffer initialized to hold the resulting UTF-8 byte string. +The `main` function encapsulates the entire encoding process. It starts by +declaring variables that will be used later, including an +`lxb_encoding_encode_t` structure to handle the encoding state, pointers to a +list of code points, and a buffer initialized to hold the resulting UTF-8 byte +string. ### Code Points Preparation -A set of Unicode code points is prepared in an array called `cps`, which includes valid points such as Cyrillic characters, a comma, a space, and an exclamation mark. Notably, one of the code points included is `0x110000`, which is invalid. This serves to demonstrate how replacement strategies can be applied when dealing with unexpected values. +A set of Unicode code points is prepared in an array called `cps`, which +includes valid points such as Cyrillic characters, a comma, a space, and an +exclamation mark. Notably, one of the code points included is `0x110000`, which +is invalid. This serves to demonstrate how replacement strategies can be applied +when dealing with unexpected values. ### Encoder Initialization -The code subsequently retrieves the encoding data for UTF-8 using the `lxb_encoding_data` function. The encoder is initialized with `lxb_encoding_encode_init`, which requires the encoder structure, encoding data, a buffer, and the size of that buffer. If initialization fails, the program uses the `FAILED` macro to report the error and terminate. +The code subsequently retrieves the encoding data for UTF-8 using the +`lxb_encoding_data` function. The encoder is initialized with +`lxb_encoding_encode_init`, which requires the encoder structure, encoding data, +a buffer, and the size of that buffer. If initialization fails, the program uses +the `FAILED` macro to report the error and terminate. 
### Setting Replacement Bytes -After successful initialization, the example configures the encoder to use specific replacement bytes for invalid code points by invoking `lxb_encoding_encode_replace_set`. This ensures that when an invalid code point is encountered during the encoding process, a predetermined sequence of bytes will replace it. +After successful initialization, the example configures the encoder to use +specific replacement bytes for invalid code points by invoking +`lxb_encoding_encode_replace_set`. This ensures that when an invalid code point +is encountered during the encoding process, a predetermined sequence of bytes +will replace it. ### Encoding Process -A message is printed to indicate the start of the encoding process. The actual encoding is performed using the `encode` function pointer from the encoding data, which takes the encoder structure and a range defined by pointers to the beginning and end of the code points. +A message is printed to indicate the start of the encoding process. The actual +encoding is performed using the `encode` function pointer from the encoding +data, which takes the encoder structure and a range defined by pointers to the +beginning and end of the code points. -If the encoding state indicates an error, it will be silently ignored here since it should not occur in this example. After encoding, the buffer is appropriately terminated with a null byte to signify the end of the string. +If the encoding state indicates an error, it will be silently ignored here since +it should not occur in this example. After encoding, the buffer is appropriately +terminated with a null byte to signify the end of the string. ### Output -Finally, the code loops through the original code points, printing each as a hexadecimal value to the console. It then outputs the resulting UTF-8 string stored in the buffer, demonstrating the successful encoding of the input code points. 
+Finally, the code loops through the original code points, printing each as a +hexadecimal value to the console. It then outputs the resulting UTF-8 string +stored in the buffer, demonstrating the successful encoding of the input code +points. ## Conclusion -This example showcases how to utilize the Lexbor library to encode Unicode code points into a UTF-8 byte string while implementing error handling and customization through replacement bytes for invalid code points. By following the steps outlined, developers can efficiently manage Unicode data in their applications. \ No newline at end of file +This example showcases how to utilize the Lexbor library to encode Unicode code +points into a UTF-8 byte string while implementing error handling and +customization through replacement bytes for invalid code points. By following +the steps outlined, developers can efficiently manage Unicode data in their +applications. \ No newline at end of file diff --git a/source/examples/encoding/buffer/from_to.md b/source/examples/encoding/buffer/from_to.md index c84c2c4..a3c35ee 100644 --- a/source/examples/encoding/buffer/from_to.md +++ b/source/examples/encoding/buffer/from_to.md @@ -1,46 +1,76 @@ # Encoding Conversion Example -This article describes an example of encoding conversion using the `from_to` program from the `lexbor` library, specifically found in the source file [lexbor/encoding/buffer/from_to.c](https://github.com/lexbor/lexbor/blob/master/examples/lexbor/encoding/buffer/from_to.c). The program reads data from the standard input, converts the data from one encoding to another (specified by the user), and outputs the result to the standard output. +This article describes an example of encoding conversion using the `from_to` +program from the `lexbor` library, specifically found in the source file +[lexbor/encoding/buffer/from_to.c](https://github.com/lexbor/lexbor/blob/master/examples/lexbor/encoding/buffer/from_to.c). 
+The program reads data from the standard input, converts the data from one +encoding to another (specified by the user), and outputs the result to the +standard output. ## Overview -The main function of the program is to facilitate the conversion of text between various character encodings. This operation is critical in environments where data needs to be interpreted correctly across different platforms or applications that utilize specific character encoding schemes. The program checks the validity of input encodings, performs the decode and encode operations, and handles errors appropriately. +The main function of the program is to facilitate the conversion of text between +various character encodings. This operation is critical in environments where +data needs to be interpreted correctly across different platforms or +applications that utilize specific character encoding schemes. The program +checks the validity of input encodings, performs the decode and encode +operations, and handles errors appropriately. ### Major Components 1. **Macro Definition for Error Handling** - A macro named `FAILED` is defined to centralize error handling within the program. It takes a flag (`with_usage`) to determine if usage instructions should be displayed, outputs an error message to `stderr`, and exits the program. This reduces redundancy in error handling and improves code maintainability. + A macro named `FAILED` is defined to centralize error handling within the + program. It takes a flag (`with_usage`) to determine if usage instructions + should be displayed, outputs an error message to `stderr`, and exits the + program. This reduces redundancy in error handling and improves code + maintainability. ```c #define FAILED(with_usage, ...) \ ``` 2. **Usage Function** - The `usage` function prints out how to use the program along with available encoding names. 
If the required number of arguments is not provided (specifically two arguments for 'from' and 'to'), this function will be invoked to guide the user. + The `usage` function prints out how to use the program along with available + encoding names. If the required number of arguments is not provided + (specifically two arguments for 'from' and 'to'), this function will be + invoked to guide the user. ```c static void usage(void) {...} ``` 3. **Main Function Logic** - The `main` function is where the primary execution occurs. It begins by checking command-line arguments to ensure the user has provided the necessary inputs. The program uses `lxb_encoding_data_by_pre_name` to retrieve encoding information based on user input, and if either input is invalid, it calls the `FAILED` macro. + The `main` function is where the primary execution occurs. It begins by + checking command-line arguments to ensure the user has provided the necessary + inputs. The program uses `lxb_encoding_data_by_pre_name` to retrieve encoding + information based on user input, and if either input is invalid, it calls the + `FAILED` macro. 4. **Initialization of Encoder and Decoder** - Both the encoder and decoder are initialized with their respective encoding data. The decoder will convert input bytes into code points (abstract character representations), while the encoder converts these code points back into byte sequences of the target encoding. + Both the encoder and decoder are initialized with their respective encoding + data. The decoder will convert input bytes into code points (abstract + character representations), while the encoder converts these code points back + into byte sequences of the target encoding. ```c status = lxb_encoding_decode_init(&decode, from, cp, sizeof(cp) / sizeof(lxb_codepoint_t)); ``` 5. **Processing Input Data** - The program reads data from `stdin` in a loop until all input is processed. 
The decode operation converts the input byte sequence into code points, which are then passed to the encoder to convert into the target encoding. The `fwrite` function is employed to write the output to `stdout`. + The program reads data from `stdin` in a loop until all input is processed. + The decode operation converts the input byte sequence into code points, which + are then passed to the encoder to convert into the target encoding. The + `fwrite` function is employed to write the output to `stdout`. ```c size = fread(inbuf, 1, sizeof(inbuf), stdin); ``` 6. **Finalization** - After all input has been processed, the program ensures that any remaining decoded data is encoded and written to the output. Special care is taken for the `iso-2022-jp` encoding, which may require specific handling to finalize the conversion. + After all input has been processed, the program ensures that any remaining + decoded data is encoded and written to the output. Special care is taken for + the `iso-2022-jp` encoding, which may require specific handling to finalize + the conversion. ```c (void) lxb_encoding_encode_finish(&encode); @@ -48,4 +78,10 @@ The main function of the program is to facilitate the conversion of text between ## Conclusion -The `from_to` example illustrates how to adeptly handle encoding conversions in C using the lexbor library. By providing a structured way to manage different encodings and offering clear error handling, this example serves as a foundational component in the development of applications that require text data manipulation across various encodings. The modular approach allows enhancements to be easily integrated, such as supporting additional encodings or modifying the input/output methods. \ No newline at end of file +The `from_to` example illustrates how to adeptly handle encoding conversions in +C using the lexbor library. 
By providing a structured way to manage different +encodings and offering clear error handling, this example serves as a +foundational component in the development of applications that require text data +manipulation across various encodings. The modular approach allows enhancements +to be easily integrated, such as supporting additional encodings or modifying +the input/output methods. \ No newline at end of file diff --git a/source/examples/encoding/data_by_name.md b/source/examples/encoding/data_by_name.md index 8de6afd..8f308cf 100644 --- a/source/examples/encoding/data_by_name.md +++ b/source/examples/encoding/data_by_name.md @@ -1,10 +1,17 @@ # Encoding Data Retrieval Example -This article provides an explanation of an example from the file [lexbor/encoding/data_by_name.c](https://github.com/lexbor/lexbor/blob/master/examples/lexbor/encoding/data_by_name.c). The purpose of this code is to demonstrate how to retrieve encoding data by its name using the Lexbor encoding library. The code illustrated here highlights the procedure for accessing character encoding information, specifically focusing on UTF-8. +This article provides an explanation of an example from the file +[lexbor/encoding/data_by_name.c](https://github.com/lexbor/lexbor/blob/master/examples/lexbor/encoding/data_by_name.c). +The purpose of this code is to demonstrate how to retrieve encoding data by its +name using the Lexbor encoding library. The code illustrated here highlights the +procedure for accessing character encoding information, specifically focusing on +UTF-8. ## Code Explanation -The program starts with the necessary `#include` directive, which includes the Lexbor encoding library header file. This library provides the functionality needed to work with different character encodings. +The program starts with the necessary `#include` directive, which includes the +Lexbor encoding library header file. 
This library provides the functionality +needed to work with different character encodings. ### Main Function @@ -14,7 +21,9 @@ The `main` function serves as the entry point of the program: int main(int argc, const char *argv[]) ``` -Here, it accepts two parameters: the argument count `argc` and an array of argument strings `argv`. Although the parameters are not utilized in this example, they are typically included for potential command-line functionality. +Here, it accepts two parameters: the argument count `argc` and an array of +argument strings `argv`. Although the parameters are not utilized in this +example, they are typically included for potential command-line functionality. ### Retrieving Encoding Data @@ -25,9 +34,15 @@ const lxb_encoding_data_t *enc_data; enc_data = lxb_encoding_data_by_name((lxb_char_t *) "uTf-8", 5); ``` -In this segment, the variable `enc_data` is declared as a pointer to `lxb_encoding_data_t`, which represents the encoding data structure in Lexbor. The function `lxb_encoding_data_by_name` is called with two arguments: the string "uTf-8" (with a deliberate mixed case) and the length of the string, which is `5`. +In this segment, the variable `enc_data` is declared as a pointer to +`lxb_encoding_data_t`, which represents the encoding data structure in Lexbor. +The function `lxb_encoding_data_by_name` is called with two arguments: the +string "uTf-8" (with a deliberate mixed case) and the length of the string, +which is `5`. -This function attempts to retrieve encoding data corresponding to the specified name. If the name provided does not match any available encoding in the library, the function will return `NULL`. +This function attempts to retrieve encoding data corresponding to the specified +name. If the name provided does not match any available encoding in the library, +the function will return `NULL`. 
### Error Handling @@ -39,17 +54,22 @@ if (enc_data == NULL) { } ``` -If `enc_data` is `NULL`, the program terminates with a failure status. This is an important check to ensure that the encoding has been found before attempting to access any of its properties, thus preventing potential runtime errors. +If `enc_data` is `NULL`, the program terminates with a failure status. This is +an important check to ensure that the encoding has been found before attempting +to access any of its properties, thus preventing potential runtime errors. ### Output Encoding Name -Upon successful retrieval of the encoding data, the program proceeds to print the name of the encoding: +Upon successful retrieval of the encoding data, the program proceeds to print +the name of the encoding: ```c printf("%s\n", enc_data->name); ``` -This line outputs the name of the encoding that has been retrieved, which in this case would be "UTF-8", assuming the spelling was correct in the function call. +This line outputs the name of the encoding that has been retrieved, which in +this case would be "UTF-8", assuming the spelling was correct in the function +call. ### Exit Status @@ -59,8 +79,15 @@ Finally, the program completes its execution successfully: return EXIT_SUCCESS; ``` -This line returns a success status to the operating system, indicating that the program has run without any issues. +This line returns a success status to the operating system, indicating that the +program has run without any issues. ## Conclusion -The example presented in [lexbor/encoding/data_by_name.c](https://github.com/lexbor/lexbor/blob/master/examples/lexbor/encoding/data_by_name.c) effectively demonstrates how to access encoding data using the Lexbor encoding library. It showcases the importance of error handling and provides a simple way to retrieve and display the name of a character encoding, using UTF-8 as a practical example. 
This code can serve as a foundational component for applications that require encoding information for text processing. \ No newline at end of file +The example presented in +[lexbor/encoding/data_by_name.c](https://github.com/lexbor/lexbor/blob/master/examples/lexbor/encoding/data_by_name.c) +effectively demonstrates how to access encoding data using the Lexbor encoding +library. It showcases the importance of error handling and provides a simple way +to retrieve and display the name of a character encoding, using UTF-8 as a +practical example. This code can serve as a foundational component for +applications that require encoding information for text processing. \ No newline at end of file diff --git a/source/examples/encoding/index.md b/source/examples/encoding/index.md index 6875d0a..280f861 100644 --- a/source/examples/encoding/index.md +++ b/source/examples/encoding/index.md @@ -1,6 +1,7 @@ # Encoding Examples -These articles delve into some of the examples in more detail to help you use `lexbor` more effectively. +These articles delve into some of the examples in more detail to help you use +`lexbor` more effectively. ```{toctree} :maxdepth: 1 diff --git a/source/examples/encoding/single/decode/decode.md b/source/examples/encoding/single/decode/decode.md index fd3fede..a57405c 100644 --- a/source/examples/encoding/single/decode/decode.md +++ b/source/examples/encoding/single/decode/decode.md @@ -1,10 +1,17 @@ # UTF-8 Decoding Example -This article explains a code example from [lexbor/encoding/single/decode/decode.c](https://github.com/lexbor/lexbor/blob/master/examples/lexbor/encoding/single/decode/decode.c), which demonstrates how to decode a UTF-8 string into its respective code points using the lexbor library. 
+This article explains a code example from +[lexbor/encoding/single/decode/decode.c](https://github.com/lexbor/lexbor/blob/master/examples/lexbor/encoding/single/decode/decode.c), +which demonstrates how to decode a UTF-8 string into its respective code points +using the lexbor library. ## Introduction -The primary purpose of this code is to decode a UTF-8 encoded string, specifically the phrase "Привет, мир!" (which means "Hello, world!" in Russian), into individual Unicode code points. It showcases the initialization of the decoder, the processing of the input string, and outputting the results in a formatted manner. +The primary purpose of this code is to decode a UTF-8 encoded string, +specifically the phrase "Привет, мир!" (which means "Hello, world!" in Russian), +into individual Unicode code points. It showcases the initialization of the +decoder, the processing of the input string, and outputting the results in a +formatted manner. ## Code Explanation @@ -16,7 +23,8 @@ The necessary header file is included at the beginning of the code: #include <lexbor/encoding/encoding.h> ``` -This header provides the necessary declarations for working with encoding functionalities offered by lexbor. +This header provides the necessary declarations for working with encoding +functionalities offered by lexbor. ### Error Handling Macro @@ -32,7 +40,9 @@ The code defines a macro for error handling: while (0) ``` -This macro outputs an error message to the standard error stream and exits the program if a failure condition is met. It streamlines error handling throughout the code. +This macro outputs an error message to the standard error stream and exits the +program if a failure condition is met. It streamlines error handling throughout +the code. ### Main Function @@ -53,18 +63,21 @@ Several variables are declared to handle the decoding process, including: - `lxb_status_t status;`: Holds the status of operations. - `lxb_encoding_decode_t decode;`: The decoder instance.
- `const lxb_encoding_data_t *encoding;`: Pointer to the encoding data. -- `const lxb_char_t *pos;`: Pointer to track the current position in the input data. +- `const lxb_char_t *pos;`: Pointer to track the current position in the input + data. ### Preparing the Input Buffer -The input UTF-8 string is initialized, along with a pointer to the end of the string: +The input UTF-8 string is initialized, along with a pointer to the end of the +string: ```c const lxb_char_t *data = (const lxb_char_t *) "Привет, мир!"; const lxb_char_t *end = data + strlen((char *) data); ``` -The `strlen` function determines the length of the string to establish the end of the data. +The `strlen` function determines the length of the string to establish the end +of the data. ### Setting Up the Encoding @@ -74,7 +87,8 @@ The program retrieves UTF-8 encoding data with: encoding = lxb_encoding_data(LXB_ENCODING_UTF_8); ``` -This function sets up the necessary encoding data for subsequent decoding operations. +This function sets up the necessary encoding data for subsequent decoding +operations. ### Initializing the Decoder @@ -87,11 +101,13 @@ if (status != LXB_STATUS_OK) { } ``` -If the initialization fails, the program invokes the `FAILED` macro to print the error and exit. +If the initialization fails, the program invokes the `FAILED` macro to print the +error and exit. 
### Decoding Loop -Following initialization, the program enters a loop to decode each character in the input string: +Following initialization, the program enters a loop to decode each character in +the input string: ```c while (data < end) { @@ -99,13 +115,16 @@ while (data < end) { } ``` -Inside the loop, the current position (`pos`) is recorded, and the decoding function is called: +Inside the loop, the current position (`pos`) is recorded, and the decoding +function is called: ```c cp = encoding->decode_single(&decode, &data, end); ``` -This line decodes a single UTF-8 character, advancing the input pointer `data` as needed. The result is checked against a maximum allowable code point value, although in this example, that condition is expected never to occur. +This line decodes a single UTF-8 character, advancing the input pointer `data` +as needed. The result is checked against a maximum allowable code point value, +although in this example, that condition is expected never to occur. ### Outputting the Results @@ -115,8 +134,12 @@ For each decoded character, the code prints the results to the standard output: printf("%.*s: 0x%04X\n", (int) (data - pos), pos, cp); ``` -This formatted output provides both the original UTF-8 character (as a substring) and its corresponding Unicode code point in hexadecimal format. +This formatted output provides both the original UTF-8 character (as a +substring) and its corresponding Unicode code point in hexadecimal format. ## Conclusion -The example demonstrates a straightforward approach to decoding a UTF-8 string into Unicode code points using the lexbor library. It effectively showcases initialization, error handling, and character decoding, providing a practical illustration of working with character encodings in C. \ No newline at end of file +The example demonstrates a straightforward approach to decoding a UTF-8 string +into Unicode code points using the lexbor library. 
It effectively showcases +initialization, error handling, and character decoding, providing a practical +illustration of working with character encodings in C. \ No newline at end of file diff --git a/source/examples/encoding/single/decode/decoder.md b/source/examples/encoding/single/decode/decoder.md index 5657c56..1e1e42b 100644 --- a/source/examples/encoding/single/decode/decoder.md +++ b/source/examples/encoding/single/decode/decoder.md @@ -1,20 +1,28 @@ # Encoding Decoder Example -In this article, we will explore the encoding decoder example found in the file [lexbor/encoding/single/decode/decoder.c](https://github.com/lexbor/lexbor/blob/master/examples/lexbor/encoding/single/decode/decoder.c). This code demonstrates how to decode input data from standard input according to a specified character encoding. It provides a useful utility for developers needing to handle various text encodings in their applications. +In this article, we will explore the encoding decoder example found in the file +[lexbor/encoding/single/decode/decoder.c](https://github.com/lexbor/lexbor/blob/master/examples/lexbor/encoding/single/decode/decoder.c). +This code demonstrates how to decode input data from standard input according to +a specified character encoding. It provides a useful utility for developers +needing to handle various text encodings in their applications. ## Code Overview -The main function of this code is to read data from standard input, decode it according to the specified encoding, and print the corresponding Unicode values. It uses the Lexbor library to facilitate this process. +The main function of this code is to read data from standard input, decode it +according to the specified encoding, and print the corresponding Unicode values. +It uses the Lexbor library to facilitate this process. 
### Header and Includes -At the beginning of the file, we find the licensing information and the inclusion of the Lexbor encoding header: +At the beginning of the file, we find the licensing information and the +inclusion of the Lexbor encoding header: ```c #include <lexbor/encoding/encoding.h> ``` -This inclusion allows access to functions and definitions related to text encoding and decoding. +This inclusion allows access to functions and definitions related to text +encoding and decoding. ### Error Handling Macro @@ -35,7 +43,10 @@ A macro named `FAILED` is defined to streamline error management: while (0) ``` -This macro takes a condition (`with_usage`) and, upon failure, prints an error message to standard error, optionally displays usage instructions, and exits the program with a failure status. This convenient encapsulation enhances code readability and maintainability. +This macro takes a condition (`with_usage`) and, upon failure, prints an error +message to standard error, optionally displays usage instructions, and exits the +program with a failure status. This convenient encapsulation enhances code +readability and maintainability. ### Usage Function @@ -50,7 +61,9 @@ static void usage(void) } ``` -This function provides users with information about how to use the decoder program and lists the available character encodings that can be specified as command-line arguments. +This function provides users with information about how to use the decoder +program and lists the available character encodings that can be specified as +command-line arguments. ### Main Function Structure @@ -84,11 +97,13 @@ if (argc != 2) { } ``` -If no encoding is specified, it invokes the `usage` function and exits gracefully. +If no encoding is specified, it invokes the `usage` function and exits +gracefully.
#### Encoding Detection -Next, the program attempts to identify the desired encoding based on the provided name: +Next, the program attempts to identify the desired encoding based on the +provided name: ```c encoding = lxb_encoding_data_by_pre_name((const lxb_char_t *) argv[1], @@ -98,7 +113,8 @@ if (encoding == NULL) { } ``` -If the specified encoding is not recognized, it triggers the `FAILED` macro, providing feedback to the user. +If the specified encoding is not recognized, it triggers the `FAILED` macro, +providing feedback to the user. #### Decoder Initialization @@ -111,7 +127,8 @@ if (status != LXB_STATUS_OK) { } ``` -This step configures the decoder to use the chosen encoding. If the initialization fails, the program prints an error and exits. +This step configures the decoder to use the chosen encoding. If the +initialization fails, the program prints an error and exits. ### Data Reading and Decoding Loop @@ -131,11 +148,13 @@ do { Within this loop: - Data is read into a buffer (`inbuf`). - Each code point is decoded using the `decode_single` method. -- Based on the value of `cp`, different output formats are printed for Unicode and ASCII characters. +- Based on the value of `cp`, different output formats are printed for Unicode + and ASCII characters. ### Output and Continuation -Finally, the program checks if the decoding process requires continuation, outputting a replacement character where necessary: +Finally, the program checks if the decoding process requires continuation, +outputting a replacement character where necessary: ```c if (cp == LXB_ENCODING_DECODE_CONTINUE) { @@ -145,4 +164,7 @@ if (cp == LXB_ENCODING_DECODE_CONTINUE) { ### Conclusion -By effectively using the Lexbor library's encoding functionalities, this code provides a flexible and powerful example of how to decode various text encodings from standard input. 
Developers can adapt this example for their applications, thereby enhancing their ability to handle encoded text data efficiently. \ No newline at end of file +By effectively using the Lexbor library's encoding functionalities, this code +provides a flexible and powerful example of how to decode various text encodings +from standard input. Developers can adapt this example for their applications, +thereby enhancing their ability to handle encoded text data efficiently. \ No newline at end of file diff --git a/source/examples/encoding/single/decode/validate.md b/source/examples/encoding/single/decode/validate.md index 5b04969..62800de 100644 --- a/source/examples/encoding/single/decode/validate.md +++ b/source/examples/encoding/single/decode/validate.md @@ -1,14 +1,24 @@ # UTF-8 Decoding and Validation Example -This article explains an example of decoding and validating a UTF-8 string, using the Lexbor library. The source file for this code example is [lexbor/encoding/single/decode/validate.c](https://github.com/lexbor/lexbor/blob/master/examples/lexbor/encoding/single/decode/validate.c). The primary objective of this code is to demonstrate how to properly decode a UTF-8 encoded string, handle decoding errors, and output both valid code points and error information for invalid byte sequences. +This article explains an example of decoding and validating a UTF-8 string, +using the Lexbor library. The source file for this code example is +[lexbor/encoding/single/decode/validate.c](https://github.com/lexbor/lexbor/blob/master/examples/lexbor/encoding/single/decode/validate.c). +The primary objective of this code is to demonstrate how to properly decode a +UTF-8 encoded string, handle decoding errors, and output both valid code points +and error information for invalid byte sequences. ## Code Breakdown -The example begins with necessary includes and macro definitions. 
It imports the required header file for Lexbor encoding and defines a macro `FAILED` that handles error reporting and terminates the program if an error occurs. +The example begins with necessary includes and macro definitions. It imports the +required header file for Lexbor encoding and defines a macro `FAILED` that +handles error reporting and terminates the program if an error occurs. ### Setting Up the Main Function -The `main` function initializes variables needed for decoding. Here, `lxb_status_t status`, `lxb_codepoint_t cp`, and `lxb_encoding_decode_t decode` are declared. Additionally, a pointer to encoding data will be initialized as the UTF-8 encoding. +The `main` function initializes variables needed for decoding. Here, +`lxb_status_t status`, `lxb_codepoint_t cp`, and `lxb_encoding_decode_t decode` +are declared. Additionally, a pointer to encoding data will be initialized as +the UTF-8 encoding. ```c lxb_status_t status; @@ -19,7 +29,10 @@ const lxb_encoding_data_t *encoding; ### Preparing the Data Buffer -The code prepares a buffer containing the string "Привет,\x80 мир!". The string contains a valid UTF-8 sequence followed by an invalid byte sequence (0x80). The end of the buffer is determined using `strlen` to ensure the decoding process will iterate through the entire string. +The code prepares a buffer containing the string "Привет,\x80 мир!". The string +contains a valid UTF-8 sequence followed by an invalid byte sequence (0x80). The +end of the buffer is determined using `strlen` to ensure the decoding process +will iterate through the entire string. ```c const lxb_char_t *data = (const lxb_char_t *) "Привет,\x80 мир!"; @@ -28,7 +41,10 @@ const lxb_char_t *end = data + strlen((char *) data); ### Initializing the Decoder -The encoding is initialized with `lxb_encoding_data(LXB_ENCODING_UTF_8)`, and the decoder is set up using the function `lxb_encoding_decode_init_single`. 
If initialization fails, the `FAILED` macro reports the error and exits the program. +The encoding is initialized with `lxb_encoding_data(LXB_ENCODING_UTF_8)`, and +the decoder is set up using the function `lxb_encoding_decode_init_single`. If +initialization fails, the `FAILED` macro reports the error and exits the +program. ```c encoding = lxb_encoding_data(LXB_ENCODING_UTF_8); @@ -40,7 +56,9 @@ if (status != LXB_STATUS_OK) { ### Decoding Process -The core loop of the example begins, where the program continuously decodes until the end of the data buffer is reached. Each iteration decodes a single code point from the UTF-8 data. +The core loop of the example begins, where the program continuously decodes +until the end of the data buffer is reached. Each iteration decodes a single +code point from the UTF-8 data. ```c while (data < end) { @@ -49,7 +67,11 @@ while (data < end) { } ``` -If a valid code point is within the acceptable range defined by `LXB_ENCODING_DECODE_MAX_CODEPOINT`, it gets printed together with the decoded UTF-8 sequence. If an invalid byte sequence is encountered that exceeds the maximum code point, it prints an error message indicating the bad byte sequences. +If a valid code point is within the acceptable range defined by +`LXB_ENCODING_DECODE_MAX_CODEPOINT`, it gets printed together with the decoded +UTF-8 sequence. If an invalid byte sequence is encountered that exceeds the +maximum code point, it prints an error message indicating the bad byte +sequences. ```c if (cp > LXB_ENCODING_DECODE_MAX_CODEPOINT) { @@ -62,4 +84,9 @@ if (cp > LXB_ENCODING_DECODE_MAX_CODEPOINT) { ### Conclusion -The program concludes by returning a success status if all decoding operations complete without errors. In summary, this code serves as an illustrative example of how to utilize the Lexbor encoding library to decode and validate UTF-8 encoded strings effectively, while properly handling potential errors in byte sequences. 
By implementing this method, developers can ensure their applications correctly interpret and display UTF-8 content. \ No newline at end of file +The program concludes by returning a success status if all decoding operations +complete without errors. In summary, this code serves as an illustrative example +of how to utilize the Lexbor encoding library to decode and validate UTF-8 +encoded strings effectively, while properly handling potential errors in byte +sequences. By implementing this method, developers can ensure their applications +correctly interpret and display UTF-8 content. \ No newline at end of file diff --git a/source/examples/encoding/single/encode/encode.md b/source/examples/encoding/single/encode/encode.md index 80581b2..968540d 100644 --- a/source/examples/encoding/single/encode/encode.md +++ b/source/examples/encoding/single/encode/encode.md @@ -1,36 +1,56 @@ # UTF-8 Encoding Example -This article explains the purpose and functionality of the UTF-8 encoding example provided in the file [lexbor/encoding/single/encode/encode.c](https://github.com/lexbor/lexbor/blob/master/examples/lexbor/encoding/single/encode/encode.c). The code demonstrates how to encode a series of Unicode code points into a UTF-8 byte string using the Lexbor encoding library. +This article explains the purpose and functionality of the UTF-8 encoding +example provided in the file +[lexbor/encoding/single/encode/encode.c](https://github.com/lexbor/lexbor/blob/master/examples/lexbor/encoding/single/encode/encode.c). +The code demonstrates how to encode a series of Unicode code points into a UTF-8 +byte string using the Lexbor encoding library. ## Code Overview -The program begins by including the necessary header file for the Lexbor encoding library. It defines a macro for error handling named `FAILED`, which simplifies printing error messages and terminating the program if initialization or execution fails. 
+The program begins by including the necessary header file for the Lexbor +encoding library. It defines a macro for error handling named `FAILED`, which +simplifies printing error messages and terminating the program if initialization +or execution fails. ### Main Function Structure -The `main` function serves as the entry point of the program. It declares several variables needed for encoding, including a buffer for the output and an encoder instance. The following key steps are involved in the encoding process: +The `main` function serves as the entry point of the program. It declares +several variables needed for encoding, including a buffer for the output and an +encoder instance. The following key steps are involved in the encoding process: -1. **Buffer Preparation**: - A buffer of 1024 bytes is allocated to hold the UTF-8 encoded string. The variables `data` and `end` are set to track the start and the end of the buffer. +1. **Buffer Preparation**: A buffer of 1024 bytes is allocated to hold the UTF-8 + encoded string. The variables `data` and `end` are set to track the start and + the end of the buffer. -2. **Unicode Code Points**: - An array of Unicode code points is defined and terminated with a zero. These code points (e.g., Cyrillic characters for "Привет, мир!") are the values that will be encoded. +2. **Unicode Code Points**: An array of Unicode code points is defined and + terminated with a zero. These code points (e.g., Cyrillic characters for + "Привет, мир!") are the values that will be encoded. -3. **Encoding Initialization**: - The function `lxb_encoding_data` retrieves the encoding data for UTF-8, which is passed to `lxb_encoding_encode_init_single` to initialize the encoder. If the initialization fails, the `FAILED` macro is invoked to handle the error. +3. **Encoding Initialization**: The function `lxb_encoding_data` retrieves the + encoding data for UTF-8, which is passed to `lxb_encoding_encode_init_single` + to initialize the encoder. 
If the initialization fails, the `FAILED` macro is + invoked to handle the error. -4. **Encoding Loop**: - The program enters a loop where each code point is processed for encoding: +4. **Encoding Loop**: The program enters a loop where each code point is + processed for encoding: - The current position in the buffer (`pos`) is saved. - - The encoder's `encode_single` function is called to perform the encoding. The length of the encoded output is returned. - - If the encoding operation is successful, the resulting UTF-8 bytes are printed alongside their corresponding Unicode code point in hexadecimal format. + - The encoder's `encode_single` function is called to perform the encoding. + The length of the encoded output is returned. + - If the encoding operation is successful, the resulting UTF-8 bytes are + printed alongside their corresponding Unicode code point in hexadecimal + format. -5. **String Termination**: - After processing all code points, the buffer is null-terminated to ensure it is properly formatted as a C string. +5. **String Termination**: After processing all code points, the buffer is + null-terminated to ensure it is properly formatted as a C string. -6. **Output Display**: - Finally, the UTF-8 encoded string is printed to the console, demonstrating the successful encoding of the provided Unicode code points. +6. **Output Display**: Finally, the UTF-8 encoded string is printed to the + console, demonstrating the successful encoding of the provided Unicode code + points. ## Conclusion -Upon reaching the end of the program, it exits gracefully, indicating successful execution. This example illustrates how to use the Lexbor encoding library for converting Unicode code points to a UTF-8 encoded string, providing a clear and practical implementation of encoding functionality in C using Lexbor. \ No newline at end of file +Upon reaching the end of the program, it exits gracefully, indicating successful +execution. 
This example illustrates how to use the Lexbor encoding library for +converting Unicode code points to a UTF-8 encoded string, providing a clear and +practical implementation of encoding functionality in C using Lexbor. \ No newline at end of file diff --git a/source/examples/encoding/single/encode/encoder.md b/source/examples/encoding/single/encode/encoder.md index e43191d..4371536 100644 --- a/source/examples/encoding/single/encode/encoder.md +++ b/source/examples/encoding/single/encode/encoder.md @@ -1,12 +1,20 @@ # Encoding Input Data Example -This article explains the purpose and functionality of the `encoder.c` source file located in the `lexbor/encoding/single/encode` directory. The code provides a utility for encoding text input based on a specified character encoding scheme. It reads data from standard input (stdin), decodes any escaped code points in the input, and encodes the results according to the selected encoding. +This article explains the purpose and functionality of the `encoder.c` source +file located in the `lexbor/encoding/single/encode` directory. The code provides +a utility for encoding text input based on a specified character encoding +scheme. It reads data from standard input (stdin), decodes any escaped code +points in the input, and encodes the results according to the selected encoding. ## Key Components ### Header and Macros -The file begins with some header information including copyright and the author's details. Following this, necessary includes and definitions are placed. The macro `FAILED` is defined to handle error reporting and exit when a critical failure occurs. This block of code succinctly prints an error message, displays usage instructions if required, and terminates the program: +The file begins with some header information including copyright and the +author's details. Following this, necessary includes and definitions are placed. 
+The macro `FAILED` is defined to handle error reporting and exit when a critical +failure occurs. This block of code succinctly prints an error message, displays +usage instructions if required, and terminates the program: ```c #define FAILED(with_usage, ...) \ @@ -23,7 +31,10 @@ The file begins with some header information including copyright and the author' ### Usage Function -The `usage` function outputs the required command-line usage for the program, listing all of the available encodings such as `UTF-8`, `ISO-8859-1`, and `SHIFT-JIS`. This function is invoked if the user does not supply the required arguments. +The `usage` function outputs the required command-line usage for the program, +listing all of the available encodings such as `UTF-8`, `ISO-8859-1`, and +`SHIFT-JIS`. This function is invoked if the user does not supply the required +arguments. ```c static void usage(void) { @@ -34,7 +45,12 @@ static void usage(void) { ### Escaped Code Point Conversion -The function `escaped_to_codepoint` is responsible for converting escaped Unicode sequences to their corresponding code points. The function processes the input data character by character, identifying whether the sequence starts with a backslash, and checking for either hexadecimal (`\x`) or Unicode (`\u`) formats. If an incorrectly formatted escape sequence is detected, an error state is triggered prompting the program to exit: +The function `escaped_to_codepoint` is responsible for converting escaped +Unicode sequences to their corresponding code points. The function processes the +input data character by character, identifying whether the sequence starts with +a backslash, and checking for either hexadecimal (`\x`) or Unicode (`\u`) +formats. 
If an incorrectly formatted escape sequence is detected, an error state +is triggered prompting the program to exit: ```c static const lxb_char_t *escaped_to_codepoint(const lxb_char_t *data, const lxb_char_t *end, @@ -52,9 +68,12 @@ static const lxb_char_t *escaped_to_codepoint(const lxb_char_t *data, const lxb_ The `main` function orchestrates the entire encoding process: -1. **Argument Handling**: It requires one argument indicating the desired encoding. -2. **Encoding Setup**: It retrieves the encoding configuration using the provided argument and initializes the encoder. -3. **Input Loop**: The program enters a loop where it reads input data from stdin, processes it into code points, and then encodes these points: +1. **Argument Handling**: It requires one argument indicating the desired + encoding. +2. **Encoding Setup**: It retrieves the encoding configuration using the + provided argument and initializes the encoder. +3. **Input Loop**: The program enters a loop where it reads input data from + stdin, processes it into code points, and then encodes these points: ```c while (data < end) { @@ -63,8 +82,15 @@ The `main` function orchestrates the entire encoding process: } ``` -4. **Output Handling**: The encoded output is written to stdout. If the encoding is `UTF-8`, replacement bytes are used as necessary. +4. **Output Handling**: The encoded output is written to stdout. If the encoding + is `UTF-8`, replacement bytes are used as necessary. -Overall, the program is designed to robustly handle input encoding, managing possible errors during reading and writing, and validating formats. The use of the `lexbor` library enables effective encoding management, providing a variety of supported character encodings. +Overall, the program is designed to robustly handle input encoding, managing +possible errors during reading and writing, and validating formats. 
The use of +the `lexbor` library enables effective encoding management, providing a variety +of supported character encodings. -In conclusion, the `encoder.c` file serves as a practical example of encoding conversion using a command-line utility, highlighting important coding principles, such as error handling, input/output operations, and state management within the context of encoding mechanisms. \ No newline at end of file +In conclusion, the `encoder.c` file serves as a practical example of encoding +conversion using a command-line utility, highlighting important coding +principles, such as error handling, input/output operations, and state +management within the context of encoding mechanisms. \ No newline at end of file diff --git a/source/examples/encoding/single/encode/validate.md b/source/examples/encoding/single/encode/validate.md index eadf876..e2beeaf 100644 --- a/source/examples/encoding/single/encode/validate.md +++ b/source/examples/encoding/single/encode/validate.md @@ -1,10 +1,19 @@ # Encoding Unicode Code Points to UTF-8 Example -This example demonstrates how to validate and encode Unicode code points into a UTF-8 byte string using the lexbor library. The functionality is encapsulated within a C program located in the [lexbor/encoding/single/encode/validate.c](https://github.com/lexbor/lexbor/blob/master/examples/lexbor/encoding/single/encode/validate.c) file. The purpose of this code is to illustrate the encoding of a set of given code points, handling exceptions for those that are invalid by replacing them with a predefined replacement character. +This example demonstrates how to validate and encode Unicode code points into a +UTF-8 byte string using the lexbor library. The functionality is encapsulated +within a C program located in the +[lexbor/encoding/single/encode/validate.c](https://github.com/lexbor/lexbor/blob/master/examples/lexbor/encoding/single/encode/validate.c) +file. 
The purpose of this code is to illustrate the encoding of a set of given +code points, handling exceptions for those that are invalid by replacing them +with a predefined replacement character. ## Overview of the Code -The code begins by including the necessary header files from the lexbor library, specifically targeting encoding functionality. It subsequently defines a macro for error handling, which outputs an error message to `stderr` and exits the program with a failure status. +The code begins by including the necessary header files from the lexbor library, +specifically targeting encoding functionality. It subsequently defines a macro +for error handling, which outputs an error message to `stderr` and exits the +program with a failure status. ### Variable Declarations @@ -12,35 +21,55 @@ The `main` function sets up various variables needed for the encoding process: - `len`: This variable stores the length of the encoded string. - `status`: Utilized for capturing the status of encoding operations. -- `encode`: An instance of `lxb_encoding_encode_t`, used to manage encoding context. +- `encode`: An instance of `lxb_encoding_encode_t`, used to manage encoding + context. - `encoding`: A pointer to the appropriate encoding data. - `pos`: A pointer that tracks the current position in the output buffer. ### Buffer Preparation -A buffer (`buffer`) of 1024 `lxb_char_t` elements is defined to hold the resulting UTF-8 byte string. Pointers are initialized to manage the writing process into this buffer safely. +A buffer (`buffer`) of 1024 `lxb_char_t` elements is defined to hold the +resulting UTF-8 byte string. Pointers are initialized to manage the writing +process into this buffer safely. ### Unicode Code Points -An array of Unicode code points is declared, which includes both valid and an intentionally invalid code point (`0x110000`). This is to illustrate how the code handles bad input during encoding. 
+An array of Unicode code points is declared, which includes both valid and an +intentionally invalid code point (`0x110000`). This is to illustrate how the +code handles bad input during encoding. ### Encoding Initialization -The code retrieves the UTF-8 encoding data using `lxb_encoding_data(LXB_ENCODING_UTF_8)` and initializes the encoding context with `lxb_encoding_encode_init_single(&encode, encoding)`. If this initialization fails, an error message is reported, and the program exits. +The code retrieves the UTF-8 encoding data using +`lxb_encoding_data(LXB_ENCODING_UTF_8)` and initializes the encoding context +with `lxb_encoding_encode_init_single(&encode, encoding)`. If this +initialization fails, an error message is reported, and the program exits. ### Encoding Loop -The core functionality is encapsulated in a loop that processes each code point from the `cps` array: - -1. **Position Tracking**: The position pointer `pos` is reset to the current data pointer at the start of the loop iteration. -2. **Encoding**: Each code point is encoded using the `encode_single` method. The returned `len` represents the number of bytes written to the buffer. -3. **Error Handling**: If `len` indicates a problem (less than `LXB_ENCODING_ENCODE_OK`), the code checks for buffer size issues (though this example does not expect to encounter this). If the code point is invalid, it prints an error message along with a replacement character output, handling the invalid code point scenario gracefully. -4. **Output**: For valid code points, the program prints the code point and its corresponding UTF-8 representation. +The core functionality is encapsulated in a loop that processes each code point +from the `cps` array: + +1. **Position Tracking**: The position pointer `pos` is reset to the current + data pointer at the start of the loop iteration. +2. **Encoding**: Each code point is encoded using the `encode_single` method. 
+ The returned `len` represents the number of bytes written to the buffer. +3. **Error Handling**: If `len` indicates a problem (less than + `LXB_ENCODING_ENCODE_OK`), the code checks for buffer size issues (though + this example does not expect to encounter this). If the code point is + invalid, it prints an error message along with a replacement character + output, handling the invalid code point scenario gracefully. +4. **Output**: For valid code points, the program prints the code point and its + corresponding UTF-8 representation. ### Finalization -After processing all code points, the program terminates the string by setting the last byte of the buffer to `0x00`. It then prints the final UTF-8 result. +After processing all code points, the program terminates the string by setting +the last byte of the buffer to `0x00`. It then prints the final UTF-8 result. ## Conclusion -The program effectively showcases how to handle Unicode encoding with proper error management for invalid inputs. This example is particularly useful for developers using the lexbor library to manage character encodings, providing insight on validating and encoding procedures in C. \ No newline at end of file +The program effectively showcases how to handle Unicode encoding with proper +error management for invalid inputs. This example is particularly useful for +developers using the lexbor library to manage character encodings, providing +insight on validating and encoding procedures in C. 
\ No newline at end of file diff --git a/source/examples/encoding/single/from_to.md b/source/examples/encoding/single/from_to.md index 08968fa..7339678 100644 --- a/source/examples/encoding/single/from_to.md +++ b/source/examples/encoding/single/from_to.md @@ -1,22 +1,33 @@ # Encoding Conversion Example -This article explains the encoding conversion functionality provided in the source file [lexbor/encoding/single/from_to.c](https://github.com/lexbor/lexbor/blob/master/examples/lexbor/encoding/single/from_to.c). The code allows users to convert text from one character encoding to another via command-line input. It demonstrates how to utilize the Lexbor encoding library for encoding and decoding different formats of character sets. +This article explains the encoding conversion functionality provided in the +source file +[lexbor/encoding/single/from_to.c](https://github.com/lexbor/lexbor/blob/master/examples/lexbor/encoding/single/from_to.c). +The code allows users to convert text from one character encoding to another via +command-line input. It demonstrates how to utilize the Lexbor encoding library +for encoding and decoding different formats of character sets. ## Overview -The main function in this code receives two command-line arguments representing the source (`from`) and target (`to`) encodings. It reads input data from standard input, decodes it from the specified `from` encoding to Unicode code points, and then encodes those code points into the specified `to` encoding before writing the output to standard output. +The main function in this code receives two command-line arguments representing +the source (`from`) and target (`to`) encodings. It reads input data from +standard input, decodes it from the specified `from` encoding to Unicode code +points, and then encodes those code points into the specified `to` encoding +before writing the output to standard output. 
## Code Breakdown ### Definitions and Includes -At the beginning of the file, we include the necessary header for the Lexbor encoding module: +At the beginning of the file, we include the necessary header for the Lexbor +encoding module: ```c #include <lexbor/encoding/encoding.h> ``` -This allows us access to various functions and types defined in the library, which facilitate character encoding tasks. +This allows us access to various functions and types defined in the library, +which facilitate character encoding tasks. ### Failure Handling Macro @@ -26,17 +37,23 @@ The `FAILED` macro is defined for error handling throughout the code: #define FAILED(with_usage, ...) ... ``` -This macro simplifies error reporting by printing error messages to standard error and conditionally calling the `usage` function to display usage instructions before terminating the program. Adopting this macro ensures a consistent approach to error handling across the code. +This macro simplifies error reporting by printing error messages to standard +error and conditionally calling the `usage` function to display usage +instructions before terminating the program. Adopting this macro ensures a +consistent approach to error handling across the code. ### Usage Function -The `usage` function provides instructions on how to use the encoding conversion tool: +The `usage` function provides instructions on how to use the encoding conversion +tool: ```c static void usage(void) { ... } ``` -It lists the accepted input encodings that users can specify when executing the program. This function is crucial for user guidance, ensuring that they know the correct format for command inputs. +It lists the accepted input encodings that users can specify when executing the +program. This function is crucial for user guidance, ensuring that they know the +correct format for command inputs. ### Main Function @@ -46,29 +63,37 @@ The `main` function orchestrates the overall process: int main(int argc, const char *argv[]) { ... } ``` -1. 
**Argument Count Check**: The function starts by checking if the user provided exactly two arguments (the source and target encodings). If not, the `usage` function is called, and the program exits. +1. **Argument Count Check**: The function starts by checking if the user + provided exactly two arguments (the source and target encodings). If not, the + `usage` function is called, and the program exits. -2. **Encoding Data Retrieval**: The code fetches the encoding information for both the source and target encodings using the `lxb_encoding_data_by_pre_name` function: +2. **Encoding Data Retrieval**: The code fetches the encoding information for + both the source and target encodings using the + `lxb_encoding_data_by_pre_name` function: ```c from = lxb_encoding_data_by_pre_name(...); to = lxb_encoding_data_by_pre_name(...); ``` - If either retrieval fails, the `FAILED` macro is triggered, stopping execution. + If either retrieval fails, the `FAILED` macro is triggered, stopping + execution. -3. **Initialization of Encoder and Decoder**: The encoder and decoder are initialized with the retrieved encoding data: +3. **Initialization of Encoder and Decoder**: The encoder and decoder are + initialized with the retrieved encoding data: ```c status = lxb_encoding_encode_init_single(&encode, to); status = lxb_encoding_decode_init_single(&decode, from); ``` - These initializations set up the necessary state for encoding and decoding operations. + These initializations set up the necessary state for encoding and decoding + operations. ### Input Reading and Processing Loop -The program enters a loop where it continuously reads from standard input until EOF (End Of File) is reached: +The program enters a loop where it continuously reads from standard input until +EOF (End Of File) is reached: ```c do { @@ -79,9 +104,11 @@ do { Within the loop: -- The fetched data is decoded using the `from` encoder to obtain Unicode code points. 
+- The fetched data is decoded using the `from` decoder to obtain Unicode code + points. -- For each code point decoded, it is then encoded with the `to` encoder and written to standard output. +- For each code point decoded, it is then encoded with the `to` encoder and + written to standard output. ### Finalization @@ -92,8 +119,14 @@ status = lxb_encoding_decode_finish_single(&decode); len = lxb_encoding_encode_finish_single(&encode, &out, out_end); ``` -These finalization steps ensure that any remaining data is processed and that resources are cleaned up properly before the program exits. +These finalization steps ensure that any remaining data is processed and that +resources are cleaned up properly before the program exits. ## Conclusion -The `from_to.c` example illustrates a practical approach to character encoding conversion using the Lexbor encoding library. It showcases error handling, user guidance, and processing loops, making it a valuable reference for developers needing to handle various text encodings in their applications. This example emphasizes the importance of robust input handling and clean output generation within character encoding operations. \ No newline at end of file +The `from_to.c` example illustrates a practical approach to character encoding +conversion using the Lexbor encoding library. It showcases error handling, user +guidance, and processing loops, making it a valuable reference for developers +needing to handle various text encodings in their applications. This example +emphasizes the importance of robust input handling and clean output generation +within character encoding operations. 
\ No newline at end of file diff --git a/source/examples/html/document_parse.md b/source/examples/html/document_parse.md index e10b793..5326468 100644 --- a/source/examples/html/document_parse.md +++ b/source/examples/html/document_parse.md @@ -1,21 +1,28 @@ # HTML Document Parsing Example -This article explains an example of parsing an HTML document using the Lexbor library. The purpose of this example, located in the source file [lexbor/html/document_parse.c](https://github.com/lexbor/lexbor/blob/master/examples/lexbor/html/document_parse.c), is to illustrate the steps necessary to create an HTML document, parse a string of HTML, and serialize the resulting DOM tree. +This article explains an example of parsing an HTML document using the Lexbor +library. The purpose of this example, located in the source file +[lexbor/html/document_parse.c](https://github.com/lexbor/lexbor/blob/master/examples/lexbor/html/document_parse.c), +is to illustrate the steps necessary to create an HTML document, parse a string +of HTML, and serialize the resulting DOM tree. ## Example Overview The example demonstrates the following key steps: 1. **Creating the HTML Document**: Initializing a new HTML document. -2. **Parsing the HTML**: Taking an HTML string and processing it to generate a DOM tree. -3. **Outputting the Results**: Printing the original HTML and the resulting DOM structure. +2. **Parsing the HTML**: Taking an HTML string and processing it to generate a + DOM tree. +3. **Outputting the Results**: Printing the original HTML and the resulting DOM + structure. 4. **Cleaning Up**: Destroying the document to free allocated resources. ## Code Explanation ### Main Function -The program starts in the `main` function, where it declares a variable for the document status and a pointer to the HTML document. +The program starts in the `main` function, where it declares a variable for the +document status and a pointer to the HTML document. 
```c lxb_status_t status; @@ -24,7 +31,8 @@ lxb_html_document_t *document; ### Defining HTML Content -A static character array contains the HTML to be parsed. The length of this HTML string is also calculated. +A static character array contains the HTML to be parsed. The length of this HTML +string is also calculated. ```c static const lxb_char_t html[] = "

<div><p>blah-blah-blah</div>

"; @@ -33,7 +41,9 @@ size_t html_len = sizeof(html) - 1; ### Document Initialization -The next segment involves initializing a new HTML document using the `lxb_html_document_create` function. This function allocates necessary memory and sets up internal structures to hold the document data. +The next segment involves initializing a new HTML document using the +`lxb_html_document_create` function. This function allocates necessary memory +and sets up internal structures to hold the document data. ```c document = lxb_html_document_create(); @@ -42,11 +52,14 @@ if (document == NULL) { } ``` -If the document creation fails, an error message is printed, allowing for debugging. +If the document creation fails, an error message is printed, allowing for +debugging. ### HTML Parsing -Once the document is created, the program parses the HTML content. The `lxb_html_document_parse` function is responsible for parsing the input HTML string. +Once the document is created, the program parses the HTML content. The +`lxb_html_document_parse` function is responsible for parsing the input HTML +string. ```c status = lxb_html_document_parse(document, html, html_len); @@ -55,36 +68,48 @@ if (status != LXB_STATUS_OK) { } ``` -If the status indicates a failure, an appropriate message is shown. This rigorous checking ensures that errors during parsing do not go unnoticed. +If the status indicates a failure, an appropriate message is shown. This +rigorous checking ensures that errors during parsing do not go unnoticed. ### Output the Results -After successfully parsing the HTML, the program prints the original HTML string and serializes the resulting DOM tree. The `PRINT` macro is used for outputting the HTML content. +After successfully parsing the HTML, the program prints the original HTML string +and serializes the resulting DOM tree. The `PRINT` macro is used for outputting +the HTML content. 
```c PRINT("HTML:"); PRINT("%s", (const char *) html); ``` -It then calls a serialization function to visualize the structure of the parsed HTML document: +It then calls a serialization function to visualize the structure of the parsed +HTML document: ```c PRINT("\nHTML Tree:"); serialize(lxb_dom_interface_node(document)); ``` -This step helps developers understand how the HTML input is translated into a DOM tree structure, which is crucial for many web development tasks. +This step helps developers understand how the HTML input is translated into a +DOM tree structure, which is crucial for many web development tasks. ### Document Cleanup -Finally, the program cleans up by destroying the HTML document to avoid memory leaks. This is done using the `lxb_html_document_destroy` function: +Finally, the program cleans up by destroying the HTML document to avoid memory +leaks. This is done using the `lxb_html_document_destroy` function: ```c lxb_html_document_destroy(document); ``` -Ensuring proper resource management is important in C programming, as it helps maintain system performance and stability. +Ensuring proper resource management is important in C programming, as it helps +maintain system performance and stability. ## Conclusion -The example provided in [lexbor/html/document_parse.c](https://github.com/lexbor/lexbor/blob/master/examples/lexbor/html/document_parse.c) serves as a clear demonstration of how to create, parse, and handle an HTML document using Lexbor. Through careful initialization, parsing, result outputting, and cleanup, this code illustrates best practices for managing HTML documents in a C environment. \ No newline at end of file +The example provided in +[lexbor/html/document_parse.c](https://github.com/lexbor/lexbor/blob/master/examples/lexbor/html/document_parse.c) +serves as a clear demonstration of how to create, parse, and handle an HTML +document using Lexbor. 
Through careful initialization, parsing, result +outputting, and cleanup, this code illustrates best practices for managing HTML +documents in a C environment. \ No newline at end of file diff --git a/source/examples/html/document_parse_chunk.md b/source/examples/html/document_parse_chunk.md index 1525ff3..b5fc7a9 100644 --- a/source/examples/html/document_parse_chunk.md +++ b/source/examples/html/document_parse_chunk.md @@ -1,20 +1,32 @@ # HTML Document Parsing Example -This article provides an overview of an example implementation of HTML document parsing using the Lexbor library. The example is located in the source file [lexbor/html/document_parse_chunk.c](https://github.com/lexbor/lexbor/blob/master/examples/lexbor/html/document_parse_chunk.c). This example demonstrates how to create an HTML document, parse it in chunks, and handle the cleaning up of allocated resources. +This article provides an overview of an example implementation of HTML document +parsing using the Lexbor library. The example is located in the source file +[lexbor/html/document_parse_chunk.c](https://github.com/lexbor/lexbor/blob/master/examples/lexbor/html/document_parse_chunk.c). +This example demonstrates how to create an HTML document, parse it in chunks, +and handle the cleaning up of allocated resources. ## Code Overview -The primary function of the code is to illustrate how to process HTML content in segments, allowing for a more flexible parsing technique suitable for scenarios where full documents may not be available in one piece. This chunk-based parsing can be particularly useful for streaming applications or when handling very large HTML documents. +The primary function of the code is to illustrate how to process HTML content in +segments, allowing for a more flexible parsing technique suitable for scenarios +where full documents may not be available in one piece. 
This chunk-based parsing +can be particularly useful for streaming applications or when handling very +large HTML documents. ### Initialization -At the beginning of the `main` function, several essential variables are declared, including a status variable of type `lxb_status_t` and a pointer to a `lxb_html_document_t`, which will represent our HTML document. +At the beginning of the `main` function, several essential variables are +declared, including a status variable of type `lxb_status_t` and a pointer to a +`lxb_html_document_t`, which will represent our HTML document. ```c lxb_html_document_t *document; ``` -The `lxb_html_document_create()` function is called to create an instance of the HTML document. It is essential to check whether the document was created successfully. +The `lxb_html_document_create()` function is called to create an instance of the +HTML document. It is essential to check whether the document was created +successfully. ```c document = lxb_html_document_create(); @@ -27,7 +39,9 @@ If the document creation fails, the program will exit, indicating an error. ### Parsing HTML Chunks -The HTML content is stored in a two-dimensional array of characters. Each string represents a fragment of the HTML document. The fragments are designed to be combined later to form a complete HTML structure. +The HTML content is stored in a two-dimensional array of characters. Each string +represents a fragment of the HTML document. The fragments are designed to be +combined later to form a complete HTML structure. ```c static const lxb_char_t html[][64] = { @@ -37,13 +51,19 @@ static const lxb_char_t html[][64] = { }; ``` -After setting up the document, the code initiates the parsing process by calling `lxb_html_document_parse_chunk_begin()`, which prepares the document to accept incoming chunks of HTML. 
+After setting up the document, the code initiates the parsing process by calling +`lxb_html_document_parse_chunk_begin()`, which prepares the document to accept +incoming chunks of HTML. ```c status = lxb_html_document_parse_chunk_begin(document); ``` -The program then enters a loop that iterates over each HTML chunk until it reaches a null-terminating character. For each chunk, it prints the chunk content and attempts to parse it using `lxb_html_document_parse_chunk()`. This function takes the current HTML chunk and its length as input, returning a status that indicates success or failure. +The program then enters a loop that iterates over each HTML chunk until it +reaches a null-terminating character. For each chunk, it prints the chunk +content and attempts to parse it using `lxb_html_document_parse_chunk()`. This +function takes the current HTML chunk and its length as input, returning a +status that indicates success or failure. ```c for (size_t i = 0; html[i][0] != '\0'; i++) { @@ -57,11 +77,14 @@ for (size_t i = 0; html[i][0] != '\0'; i++) { } ``` -If any chunk fails to parse correctly, the program will exit with an error message. +If any chunk fails to parse correctly, the program will exit with an error +message. ### Finalization -After processing all HTML chunks, the end of the parsing process is signaled with the call to `lxb_html_document_parse_chunk_end()`. This function finalizes the parsing operation and validates the final structure of the document. +After processing all HTML chunks, the end of the parsing process is signaled +with the call to `lxb_html_document_parse_chunk_end()`. This function finalizes +the parsing operation and validates the final structure of the document. 
```c status = lxb_html_document_parse_chunk_end(document); @@ -72,7 +95,9 @@ if (status != LXB_STATUS_OK) { ### Printing Results -Once parsing is complete, the example demonstrates how to serialize the resulting HTML DOM tree using the `serialize()` function, allowing the user to see the structured representation of the parsed HTML content. +Once parsing is complete, the example demonstrates how to serialize the +resulting HTML DOM tree using the `serialize()` function, allowing the user to +see the structured representation of the parsed HTML content. ```c PRINT("\nHTML Tree:"); @@ -81,7 +106,9 @@ serialize(lxb_dom_interface_node(document)); ### Cleanup -Finally, the document is destroyed using `lxb_html_document_destroy()`, which frees the allocated memory associated with the HTML document instance. This resource management step is crucial in avoiding memory leaks. +Finally, the document is destroyed using `lxb_html_document_destroy()`, which +frees the allocated memory associated with the HTML document instance. This +resource management step is crucial in avoiding memory leaks. ```c lxb_html_document_destroy(document); @@ -89,4 +116,8 @@ lxb_html_document_destroy(document); ## Conclusion -This example effectively illustrates how to use Lexbor for HTML document parsing in a chunked manner. The structure and logic of the code provide a solid foundation for more advanced HTML processing applications. It encapsulates essential operations such as initialization, incremental parsing, result extraction, and cleanup in a clear and easy-to-follow manner. \ No newline at end of file +This example effectively illustrates how to use Lexbor for HTML document parsing +in a chunked manner. The structure and logic of the code provide a solid +foundation for more advanced HTML processing applications. It encapsulates +essential operations such as initialization, incremental parsing, result +extraction, and cleanup in a clear and easy-to-follow manner. 
\ No newline at end of file diff --git a/source/examples/html/document_title.md b/source/examples/html/document_title.md index e1f1c49..411e1a1 100644 --- a/source/examples/html/document_title.md +++ b/source/examples/html/document_title.md @@ -1,17 +1,27 @@ # HTML Document Title Example -This article will explain the functionality of the HTML document title example implemented in the source code found in [lexbor/html/document_title.c](https://github.com/lexbor/lexbor/blob/master/examples/lexbor/html/document_title.c). The purpose of this code is to demonstrate how to parse an HTML string, retrieve its title, modify the title, and then display the resulting HTML document structure using the Lexbor library. +This article will explain the functionality of the HTML document title example +implemented in the source code found in +[lexbor/html/document_title.c](https://github.com/lexbor/lexbor/blob/master/examples/lexbor/html/document_title.c). +The purpose of this code is to demonstrate how to parse an HTML string, retrieve +its title, modify the title, and then display the resulting HTML document +structure using the Lexbor library. ## Code Breakdown ### Initialization -The code begins with the inclusion of the required headers and the setup of the `main` function, which is the entry point of the program. Here, the main task involves creating an HTML document instance and specifying the necessary variables. +The code begins with the inclusion of the required headers and the setup of the +`main` function, which is the entry point of the program. Here, the main task +involves creating an HTML document instance and specifying the necessary +variables. ```c lxb_html_document_t *document; ``` -This line declares a pointer to an `lxb_html_document_t` structure which represents the HTML document being created. The succeeding lines define variables for storing the title and its length. 
+This line declares a pointer to an `lxb_html_document_t` structure which +represents the HTML document being created. The succeeding lines define +variables for storing the title and its length. ### Creating the Document @@ -23,11 +33,14 @@ if (document == NULL) { FAILED("Failed to create HTML Document"); } ``` -In this snippet, the `lxb_html_document_create` function is called to allocate memory for a new HTML document. If the document fails to create, the program invokes the `FAILED` macro to signal an error. +In this snippet, the `lxb_html_document_create` function is called to allocate +memory for a new HTML document. If the document fails to create, the program +invokes the `FAILED` macro to signal an error. ### Parsing HTML -After successfully creating the document, the code proceeds to parse the HTML string: +After successfully creating the document, the code proceeds to parse the HTML +string: ```c status = lxb_html_document_parse(document, html, html_len); @@ -35,7 +48,9 @@ if (status != LXB_STATUS_OK) { FAILED("Failed to parse HTML"); } ``` -Here, the HTML content defined in the `html` array—specifically the title tag which contains extra spaces—is parsed. The variable `status` checks if the operation was successful. If not, the program exits with an error message. +Here, the HTML content defined in the `html` array—specifically the title tag +which contains extra spaces—is parsed. The variable `status` checks if the +operation was successful. If not, the program exits with an error message. ### Retrieving the Title @@ -44,16 +59,21 @@ Once the document is parsed, the code retrieves the title of the document: ```c title = lxb_html_document_title(document, &title_len); ``` -This function call extracts the title text from the document, storing it into the `title` variable. The length of the title is also provided through the `title_len` reference. 
The subsequent `if` statement checks whether the title exists, printing the title or an empty message accordingly. +This function call extracts the title text from the document, storing it into +the `title` variable. The length of the title is also provided through the +`title_len` reference. The subsequent `if` statement checks whether the title +exists, printing the title or an empty message accordingly. ### Obtaining the Raw Title -The following code retrieves the raw title, which includes the original formatting (e.g., extra spaces): +The following code retrieves the raw title, which includes the original +formatting (e.g., extra spaces): ```c title = lxb_html_document_title_raw(document, &title_len); ``` -Much like the previous title retrieval, this extracts the unformatted title, allowing a comparison between the cleaned and raw titles. +Much like the previous title retrieval, this extracts the unformatted title, +allowing a comparison between the cleaned and raw titles. ### Modifying the Title @@ -65,20 +85,26 @@ if (status != LXB_STATUS_OK) { FAILED("Failed to change HTML title"); } ``` -By invoking `lxb_html_document_title_set`, the title is altered to a new value defined by the `new_title` variable. An error check follows to ensure the title change was successful. +By invoking `lxb_html_document_title_set`, the title is altered to a new value +defined by the `new_title` variable. An error check follows to ensure the title +change was successful. ### Displaying the New Title and HTML Structure -The final steps involve displaying the updated title and the entire HTML document structure after modification: +The final steps involve displaying the updated title and the entire HTML +document structure after modification: ```c title = lxb_html_document_title(document, &title_len); ``` -This repeats the earlier title retrieval process to print the new title. Finally, the code prints the altered HTML structure to show the impact of the title change. 
+This repeats the earlier title retrieval process to print the new title. +Finally, the code prints the altered HTML structure to show the impact of the +title change. ### Cleanup -Lastly, the document is destroyed to free the allocated memory, which is crucial for preventing memory leaks: +Lastly, the document is destroyed to free the allocated memory, which is crucial +for preventing memory leaks: ```c lxb_html_document_destroy(document); @@ -86,4 +112,10 @@ lxb_html_document_destroy(document); ## Conclusion -This example illustrates the basic operations for handling HTML document titles using the Lexbor library, including parsing content, accessing and modifying the title, and ensuring proper resource management. The structure of the code is straightforward, aiming to provide a clear understanding of each step involved in managing an HTML document's title. As developers familiarize themselves with the functionalities offered by Lexbor, they will be better equipped to manipulate HTML content programmatically. \ No newline at end of file +This example illustrates the basic operations for handling HTML document titles +using the Lexbor library, including parsing content, accessing and modifying the +title, and ensuring proper resource management. The structure of the code is +straightforward, aiming to provide a clear understanding of each step involved +in managing an HTML document's title. As developers familiarize themselves with +the functionalities offered by Lexbor, they will be better equipped to +manipulate HTML content programmatically. 
\ No newline at end of file diff --git a/source/examples/html/element_attributes.md b/source/examples/html/element_attributes.md index 8f01734..c8cb12c 100644 --- a/source/examples/html/element_attributes.md +++ b/source/examples/html/element_attributes.md @@ -1,10 +1,19 @@ # Element Attributes Example -This article explains the implementation found in [lexbor/html/element_attributes.c](https://github.com/lexbor/lexbor/blob/master/examples/lexbor/html/element_attributes.c), which demonstrates how to manipulate HTML element attributes using the Lexbor library. The example outlines parsing an HTML snippet, finding an element, and performing various operations involving element attributes, such as adding, checking existence, retrieving, modifying, and removing attributes from an element. +This article explains the implementation found in +[lexbor/html/element_attributes.c](https://github.com/lexbor/lexbor/blob/master/examples/lexbor/html/element_attributes.c), +which demonstrates how to manipulate HTML element attributes using the Lexbor +library. The example outlines parsing an HTML snippet, finding an element, and +performing various operations involving element attributes, such as adding, +checking existence, retrieving, modifying, and removing attributes from an +element. ## Code Overview -The code begins by including necessary headers and defining the main function, which initializes variables for handling the document and its components. The use of `lxb_status_t` for tracking the status of operations is essential throughout the code. +The code begins by including necessary headers and defining the main function, +which initializes variables for handling the document and its components. The +use of `lxb_status_t` for tracking the status of operations is essential +throughout the code. 
### HTML Parsing @@ -20,7 +29,9 @@ A document is parsed from this HTML string with: document = parse(html, html_len); ``` -After parsing, the code outputs the structure of the DOM tree to the console using a `serialize` function, allowing developers to visualize the parsed HTML elements. +After parsing, the code outputs the structure of the DOM tree to the console +using a `serialize` function, allowing developers to visualize the parsed HTML +elements. ### Collection Creation @@ -30,17 +41,21 @@ Next, a DOM collection is created to hold references to found elements: collection = lxb_dom_collection_make(&document->dom_document, 16); ``` -If the collection creation fails, an error message is printed, and the program exits. +If the collection creation fails, an error message is printed, and the program +exits. ### Searching for Elements -To find the `
<div>` element in the DOM, the code first obtains the body element and then calls: +To find the `<div>
` element in the DOM, the code first obtains the body element +and then calls: ```c status = lxb_dom_elements_by_tag_name(element, collection, (const lxb_char_t *) "div", 3); ``` -This line searches for all `
<div>` elements under the specified parent element. A check for successful status and the collection's length follows, ensuring that at least one `<div>` is found. +This line searches for all `<div>` elements under the specified parent element. +A check for successful status and the collection's length follows, ensuring that +at least one `<div>
` is found. ### Adding an Attribute @@ -50,7 +65,8 @@ Once the element is identified, a new attribute is added using: attr = lxb_dom_element_set_attribute(element, name, name_size, (const lxb_char_t *) "oh God", 6); ``` -In this case, the attribute named "my-name" is appended with a value of "oh God." If the attribute creation fails, an error message is displayed. +In this case, the attribute named "my-name" is appended with a value of "oh +God." If the attribute creation fails, an error message is displayed. ### Checking Attribute Existence @@ -60,7 +76,8 @@ The program checks if the newly added attribute exists: is_exist = lxb_dom_element_has_attribute(element, name, name_size); ``` -A printed message confirms whether the attribute is present or not based on the check. +A printed message confirms whether the attribute is present or not based on the +check. ### Retrieving Attribute Values @@ -80,11 +97,13 @@ The code then demonstrates how to iterate through all attributes of the element: attr = lxb_dom_element_first_attribute(element); ``` -This iterates through attributes using a `while` loop, printing each attribute's name and value until there are no more attributes in the collection. +This iterates through attributes using a `while` loop, printing each attribute's +name and value until there are no more attributes in the collection. ### Modifying an Attribute Value -To change the value of an existing attribute, the code retrieves the attribute by name: +To change the value of an existing attribute, the code retrieves the attribute +by name: ```c attr = lxb_dom_element_attr_by_name(element, name, name_size); @@ -104,11 +123,13 @@ Finally, the example concludes with the removal of the newly added attribute: lxb_dom_element_remove_attribute(element, name, name_size); ``` -This operation is followed by a serialized output of the DOM tree again, allowing the developer to observe changes. 
+This operation is followed by a serialized output of the DOM tree again, +allowing the developer to observe changes. ### Cleanup -The code ensures proper resource management by destroying the collection and the document at the end of the main function to prevent memory leaks: +The code ensures proper resource management by destroying the collection and the +document at the end of the main function to prevent memory leaks: ```c lxb_dom_collection_destroy(collection, true); @@ -117,4 +138,8 @@ lxb_html_document_destroy(document); ## Conclusion -The `element_attributes.c` example illustrates fundamental operations in DOM manipulation provided by the Lexbor library. The code efficiently demonstrates how to parse HTML, locate and manipulate elements, manage attributes, and ensure appropriate cleanup of resources, making it a valuable reference for web developers working with the Lexbor framework. \ No newline at end of file +The `element_attributes.c` example illustrates fundamental operations in DOM +manipulation provided by the Lexbor library. The code efficiently demonstrates +how to parse HTML, locate and manipulate elements, manage attributes, and ensure +appropriate cleanup of resources, making it a valuable reference for web +developers working with the Lexbor framework. \ No newline at end of file diff --git a/source/examples/html/element_create.md b/source/examples/html/element_create.md index 58ca337..9de557f 100644 --- a/source/examples/html/element_create.md +++ b/source/examples/html/element_create.md @@ -1,37 +1,56 @@ # HTML Element Creation Example -This article explains the implementation of creating and appending HTML elements in a document using the respective Lexbor library. The example provided is from the source file [lexbor/html/element_create.c](https://github.com/lexbor/lexbor/blob/master/examples/lexbor/html/element_create.c). 
+This article explains the implementation of creating and appending HTML elements +in a document using the respective Lexbor library. The example provided is from +the source file +[lexbor/html/element_create.c](https://github.com/lexbor/lexbor/blob/master/examples/lexbor/html/element_create.c). ## Introduction -The code demonstrates how to initialize an HTML document, create various HTML elements using their tag IDs, and manage them within a document structure. The main functionalities utilized include parsing an empty HTML document, creating elements, and preserving the overall tree structure through serialization. +The code demonstrates how to initialize an HTML document, create various HTML +elements using their tag IDs, and manage them within a document structure. The +main functionalities utilized include parsing an empty HTML document, creating +elements, and preserving the overall tree structure through serialization. ## Code Overview -1. **Initialization**: - The code begins with the necessary includes and the definition of the `main` function. It declares necessary pointers to hold the document, body element, and tags. +1. **Initialization**: The code begins with the necessary includes and the + definition of the `main` function. It declares necessary pointers to hold the + document, body element, and tags. -2. **Parse Document**: - The function `parse` is called with an empty string, initializing an HTML document. This is essential for setting up a base where elements can be created and manipulated. +2. **Parse Document**: The function `parse` is called with an empty string, + initializing an HTML document. This is essential for setting up a base where + elements can be created and manipulated. -3. **Accessing the Body Element**: - The body of the document is obtained using `lxb_html_document_body_element(document)`, allowing further manipulations to be performed on this node. +3. 
**Accessing the Body Element**: The body of the document is obtained using + `lxb_html_document_body_element(document)`, allowing further manipulations to + be performed on this node. -4. **Creating Elements**: - A loop iterates over all tag IDs defined by the Lexbor library, from `LXB_TAG_A` to `LXB_TAG__LAST_ENTRY`. For each tag: +4. **Creating Elements**: A loop iterates over all tag IDs defined by the Lexbor + library, from `LXB_TAG_A` to `LXB_TAG__LAST_ENTRY`. For each tag: - The tag name is retrieved using `lxb_tag_name_by_id`. - - An element is created with `lxb_dom_document_create_element`. This function constructs the DOM element based on the tag name. - - If the tag is identified as void (such as `
` or ``), it is created without a text node. Conversely, non-void tags generate text nodes through `lxb_dom_document_create_text_node`, allowing text content to be associated with those elements. + - An element is created with `lxb_dom_document_create_element`. This function + constructs the DOM element based on the tag name. + - If the tag is identified as void (such as `
` or ``), it is created + without a text node. Conversely, non-void tags generate text nodes through + `lxb_dom_document_create_text_node`, allowing text content to be associated + with those elements. -5. **Inserting Elements into the Tree**: - Each created element is serialized for output and then inserted into the body of the document using `lxb_dom_node_insert_child`. +5. **Inserting Elements into the Tree**: Each created element is serialized for + output and then inserted into the body of the document using + `lxb_dom_node_insert_child`. -6. **Final Output**: - After all elements are created and appended, the updated document tree is printed to show the result of the insertions. +6. **Final Output**: After all elements are created and appended, the updated + document tree is printed to show the result of the insertions. -7. **Cleanup**: - Finally, the allocated document is destroyed using `lxb_html_document_destroy` to prevent memory leaks. +7. **Cleanup**: Finally, the allocated document is destroyed using + `lxb_html_document_destroy` to prevent memory leaks. ## Conclusion -This program effectively showcases the process of dynamically creating HTML elements using the Lexbor library. It covers the aspects of parsing, element creation, manipulation, and serialization, providing an essential toolkit for developers looking to work with HTML structures programmatically. The inclusion of error handling ensures reliability, allowing developers to catch and address potential issues during element creation. \ No newline at end of file +This program effectively showcases the process of dynamically creating HTML +elements using the Lexbor library. It covers the aspects of parsing, element +creation, manipulation, and serialization, providing an essential toolkit for +developers looking to work with HTML structures programmatically. The inclusion +of error handling ensures reliability, allowing developers to catch and address +potential issues during element creation. 
\ No newline at end of file diff --git a/source/examples/html/element_innerHTML.md b/source/examples/html/element_innerHTML.md index aeef2d8..00a4c4f 100644 --- a/source/examples/html/element_innerHTML.md +++ b/source/examples/html/element_innerHTML.md @@ -1,24 +1,38 @@ # Setting innerHTML Example -This article will explain the `innerHTML` manipulation in the context of the Lexbor HTML parser, as illustrated in the source file [lexbor/html/element_innerHTML.c](https://github.com/lexbor/lexbor/blob/master/examples/lexbor/html/element_innerHTML.c). This example demonstrates how to parse HTML content, modify an element's inner HTML, and serialize the result. +This article will explain the `innerHTML` manipulation in the context of the +Lexbor HTML parser, as illustrated in the source file +[lexbor/html/element_innerHTML.c](https://github.com/lexbor/lexbor/blob/master/examples/lexbor/html/element_innerHTML.c). +This example demonstrates how to parse HTML content, modify an element's inner +HTML, and serialize the result. ## Code Overview -The code starts with the inclusion of the necessary header file, `base.h`, which likely contains the essential definitions and functions for the Lexbor library. The `main` function serves as the entry point for the execution of this program. +The code starts with the inclusion of the necessary header file, `base.h`, which +likely contains the essential definitions and functions for the Lexbor library. +The `main` function serves as the entry point for the execution of this program. ### HTML Parsing -The program begins by defining a simple HTML string containing a `
<div>` with a nested `<span>` element. The length of this string is calculated using `sizeof(html) - 1` to exclude the null terminator from the count. The predefined HTML string is as follows: +The program begins by defining a simple HTML string containing a `<div>` with a +nested `<span>` element. The length of this string is calculated using +`sizeof(html) - 1` to exclude the null terminator from the count. The predefined +HTML string is as follows: ```c static const lxb_char_t html[] = "
blah-blah-blah
"; ``` -Next, the `parse` function is called with the HTML string and its length. This function processes the HTML and generates a document object model (DOM), representing the structure of the HTML document in memory. +Next, the `parse` function is called with the HTML string and its length. This +function processes the HTML and generates a document object model (DOM), +representing the structure of the HTML document in memory. ### Printing the Parsed HTML -The program checks the output of the `parse` function and prints the original HTML and the resulting DOM tree. This is accomplished with the `PRINT` macro, which appears to be a utility for outputting messages. The serialized DOM is obtained using the `serialize` function on the document's root node: +The program checks the output of the `parse` function and prints the original +HTML and the resulting DOM tree. This is accomplished with the `PRINT` macro, +which appears to be a utility for outputting messages. The serialized DOM is +obtained using the `serialize` function on the document's root node: ```c PRINT("HTML:"); @@ -29,24 +43,32 @@ serialize(lxb_dom_interface_node(document)); ### Inner HTML Modification -Subsequently, a second HTML string is defined, which will be set as the inner HTML of the body element. This inner HTML is specified as follows: +Subsequently, a second HTML string is defined, which will be set as the inner +HTML of the body element. This inner HTML is specified as follows: ```c static const lxb_char_t inner[] = "
  • 1
  • 2
  • 3
"; ``` -The program retrieves the body element of the document using `lxb_html_document_body_element(document)`. The inner HTML of the body is then set using the `lxb_html_element_inner_html_set` function, which takes the body element and the inner HTML string along with its length as arguments: +The program retrieves the body element of the document using +`lxb_html_document_body_element(document)`. The inner HTML of the body is then +set using the `lxb_html_element_inner_html_set` function, which takes the body +element and the inner HTML string along with its length as arguments: ```c element = lxb_html_element_inner_html_set(lxb_html_interface_element(body), inner, inner_len); ``` -If the `element` is `NULL`, indicating a failure in setting the inner HTML, a failure message is printed through the `FAILED` macro. +If the `element` is `NULL`, indicating a failure in setting the inner HTML, a +failure message is printed through the `FAILED` macro. ### Final Output -After setting the inner HTML, the program serializes the modified DOM tree and prints the result. This demonstrates the changes made by the inner HTML operation. Finally, the code cleans up by destroying the document to release resources allocated for the DOM. +After setting the inner HTML, the program serializes the modified DOM tree and +prints the result. This demonstrates the changes made by the inner HTML +operation. Finally, the code cleans up by destroying the document to release +resources allocated for the DOM. ```c PRINT("\nTree after innerHTML set:"); @@ -56,4 +78,8 @@ lxb_html_document_destroy(document); ## Conclusion -The example provided illustrates how to parse an HTML string, modify an element's inner HTML content, and serialize the resulting DOM structure using Lexbor's capabilities. This demonstrates an essential functionality often used in web development for DOM manipulation, showcasing the ease of use of the Lexbor library for such tasks. 
\ No newline at end of file +The example provided illustrates how to parse an HTML string, modify an +element's inner HTML content, and serialize the resulting DOM structure using +Lexbor's capabilities. This demonstrates an essential functionality often used +in web development for DOM manipulation, showcasing the ease of use of the +Lexbor library for such tasks. \ No newline at end of file diff --git a/source/examples/html/elements_by_attr.md b/source/examples/html/elements_by_attr.md index 0a8aeab..b7300fe 100644 --- a/source/examples/html/elements_by_attr.md +++ b/source/examples/html/elements_by_attr.md @@ -1,10 +1,15 @@ # Retrieving Elements by Attribute Example -This article will explain the functionality and implementation of the code found in **lexbor/html/elements_by_attr.c**, which demonstrates how to retrieve DOM elements based on specific attributes using the lexbor library. +This article will explain the functionality and implementation of the code found +in **lexbor/html/elements_by_attr.c**, which demonstrates how to retrieve DOM +elements based on specific attributes using the lexbor library. ## Overview -The provided code showcases how to extract elements from an HTML document based on their attributes. It specifically focuses on obtaining elements by 'class' and 'href' attributes, employing methods that match, search from the beginning, and search from the end of the attribute values. +The provided code showcases how to extract elements from an HTML document based +on their attributes. It specifically focuses on obtaining elements by 'class' +and 'href' attributes, employing methods that match, search from the beginning, +and search from the end of the attribute values. ## Code Breakdown @@ -17,17 +22,22 @@ The code starts with including essential headers: #include ``` -The `base.h` header seems to contain definitions and functions crucial for this example, while `lexbor/dom/dom.h` provides the necessary DOM manipulations for lexbor. 
+The `base.h` header seems to contain definitions and functions crucial for this +example, while `lexbor/dom/dom.h` provides the necessary DOM manipulations for +lexbor. ### Print Collection Function -The function `print_collection_elements` is defined to handle the output of the retrieved elements: +The function `print_collection_elements` is defined to handle the output of the +retrieved elements: ```c static void print_collection_elements(lxb_dom_collection_t *collection) ``` -This function loops through the elements within the provided collection using its length and utilizes the `serialize_node` function to print each element. After processing, it ensures to clean up the collection to prevent memory leaks. +This function loops through the elements within the provided collection using +its length and utilizes the `serialize_node` function to print each element. +After processing, it ensures to clean up the collection to prevent memory leaks. ### Main Function Execution @@ -49,7 +59,8 @@ const lxb_char_t html[] = "
" "
"; ``` -This string contains several `
<div>` and `<a>` tags with diverse class attributes and an `href`. The length of this HTML string is then calculated. +This string contains several `<div>
` and `` tags with diverse class +attributes and an `href`. The length of this HTML string is then calculated. #### Creating Document and Collection @@ -59,7 +70,8 @@ Following that, the HTML is parsed, creating a document object: document = parse(html, html_szie); ``` -Next, a collection object is created that will hold the elements found based on the attribute queries: +Next, a collection object is created that will hold the elements found based on +the attribute queries: ```c collection = lxb_dom_collection_make(&document->dom_document, 128); @@ -71,8 +83,8 @@ A check is performed to ensure that the collection was created successfully. The program performs several searches: -1. **Full Match:** - Using `lxb_dom_elements_by_attr`, it searches for elements with the exact class `red c++ best`: +1. **Full Match:** Using `lxb_dom_elements_by_attr`, it searches for elements + with the exact class `red c++ best`: ```c status = lxb_dom_elements_by_attr(body, collection, @@ -83,8 +95,8 @@ The program performs several searches: If the search is successful, the found elements are printed. -2. **From Beginning:** - The code retrieves elements with an `href` that starts with `http`: +2. **From Beginning:** The code retrieves elements with an `href` that starts + with `http`: ```c status = lxb_dom_elements_by_attr_begin(body, collection, @@ -93,8 +105,7 @@ The program performs several searches: true); ``` -3. **From End:** - This search targets elements with classes ending in `grep`: +3. **From End:** This search targets elements with classes ending in `grep`: ```c status = lxb_dom_elements_by_attr_end(body, collection, @@ -103,8 +114,8 @@ The program performs several searches: true); ``` -4. **Contain:** - Finally, it looks for elements where the class contains the substring `c++ b`: +4. 
**Contain:** Finally, it looks for elements where the class contains the + substring `c++ b`: ```c status = lxb_dom_elements_by_attr_contain(body, collection, @@ -113,11 +124,13 @@ The program performs several searches: true); ``` -Each of these searches utilizes the collection to retrieve relevant elements, printing them as they are found. +Each of these searches utilizes the collection to retrieve relevant elements, +printing them as they are found. #### Cleanup -After the searches, cleanup processes are executed to free the allocated resources: +After the searches, cleanup processes are executed to free the allocated +resources: ```c lxb_dom_collection_destroy(collection, true); @@ -128,4 +141,8 @@ This is critical for maintaining memory hygiene in C programs. ## Conclusion -This code snippet demonstrates how to efficiently query and manipulate DOM elements in an HTML document using the lexbor library. By utilizing various search strategies based on attributes, developers can effectively streamline their DOM interactions, showcasing the flexibility and power of the lexbor library for handling HTML content. \ No newline at end of file +This code snippet demonstrates how to efficiently query and manipulate DOM +elements in an HTML document using the lexbor library. By utilizing various +search strategies based on attributes, developers can effectively streamline +their DOM interactions, showcasing the flexibility and power of the lexbor +library for handling HTML content. 
\ No newline at end of file diff --git a/source/examples/html/elements_by_class_name.md b/source/examples/html/elements_by_class_name.md index 4f6f10d..47d3e0c 100644 --- a/source/examples/html/elements_by_class_name.md +++ b/source/examples/html/elements_by_class_name.md @@ -1,10 +1,19 @@ # Getting Elements by Class Name Example -In this article, we will explore the implementation details and functionality of the `elements_by_class_name` example, found in the [lexbor/html/elements_by_class_name.c](https://github.com/lexbor/lexbor/blob/master/examples/lexbor/html/elements_by_class_name.c) source file. The code demonstrates how to parse an HTML string and retrieve elements with a specific class name using the lexbor library. This example is essential for developers seeking to manipulate and query DOM elements in a structured manner. +In this article, we will explore the implementation details and functionality of +the `elements_by_class_name` example, found in the +[lexbor/html/elements_by_class_name.c](https://github.com/lexbor/lexbor/blob/master/examples/lexbor/html/elements_by_class_name.c) +source file. The code demonstrates how to parse an HTML string and retrieve +elements with a specific class name using the lexbor library. This example is +essential for developers seeking to manipulate and query DOM elements in a +structured manner. ## Overview -The `main` function begins by initializing variables, including `status`, `element`, `document`, and `collection`. It then assigns an HTML string to the `html` variable, which contains multiple `
<div>` elements with various class names. The length of the HTML string is calculated and stored in `html_size`. +The `main` function begins by initializing variables, including `status`, +`element`, `document`, and `collection`. It then assigns an HTML string to the +`html` variable, which contains multiple `<div>
` elements with various class +names. The length of the HTML string is calculated and stored in `html_size`. ```c const lxb_char_t html[] = "
" @@ -17,7 +26,9 @@ size_t html_size = sizeof(html) - 1; ## Parsing the HTML Document -Next, the code invokes the `parse` function to parse the HTML string and create a DOM document. This document serves as the basis for subsequent operations on the DOM elements contained within the HTML. +Next, the code invokes the `parse` function to parse the HTML string and create +a DOM document. This document serves as the basis for subsequent operations on +the DOM elements contained within the HTML. ```c document = parse(html, html_size); @@ -25,7 +36,10 @@ document = parse(html, html_size); ## Creating a Collection for DOM Elements -Once the document is obtained, the next step is to create a collection to hold the elements retrieved by class name. The `lxb_dom_collection_make` function is called with the document's DOM and an initial capacity of 128. If the collection cannot be created, an error message is triggered. +Once the document is obtained, the next step is to create a collection to hold +the elements retrieved by class name. The `lxb_dom_collection_make` function is +called with the document's DOM and an initial capacity of 128. If the collection +cannot be created, an error message is triggered. ```c collection = lxb_dom_collection_make(&document->dom_document, 128); @@ -36,7 +50,10 @@ if (collection == NULL) { ## Retrieving Elements by Class Name -The `lxb_dom_elements_by_class_name` function enables the search for elements with a specified class name. In this instance, it looks for elements with the class name "best". The function leverages the interface of the document's body to initiate the retrieval process and populate the `collection`. +The `lxb_dom_elements_by_class_name` function enables the search for elements +with a specified class name. In this instance, it looks for elements with the +class name "best". The function leverages the interface of the document's body +to initiate the retrieval process and populate the `collection`. 
```c status = lxb_dom_elements_by_class_name(lxb_dom_interface_element(document->body), @@ -46,7 +63,8 @@ if (status != LXB_STATUS_OK) { } ``` -After ensuring the retrieval is successful, the code proceeds to print the original HTML and details about the found elements. +After ensuring the retrieval is successful, the code proceeds to print the +original HTML and details about the found elements. ```c PRINT("HTML:"); @@ -57,7 +75,9 @@ PRINT("Elements found:"); ## Serializing and Printing Found Elements -A loop iterates through the collection of found elements, invoking the `serialize_node` function to output each element's details. This demonstrates how easy it is to interact with the elements returned by the class name query. +A loop iterates through the collection of found elements, invoking the +`serialize_node` function to output each element's details. This demonstrates +how easy it is to interact with the elements returned by the class name query. ```c for (size_t i = 0; i < lxb_dom_collection_length(collection); i++) { @@ -68,7 +88,9 @@ for (size_t i = 0; i < lxb_dom_collection_length(collection); i++) { ## Cleanup -Finally, the `collection` and `document` are cleaned up to free allocated resources. This step is crucial for managing memory within the application, especially when dealing with large or complex documents. +Finally, the `collection` and `document` are cleaned up to free allocated +resources. This step is crucial for managing memory within the application, +especially when dealing with large or complex documents. ```c lxb_dom_collection_destroy(collection, true); @@ -77,4 +99,8 @@ lxb_html_document_destroy(document); ## Conclusion -The `elements_by_class_name` example illustrates how to use the lexbor library to parse HTML content, search for elements by class name, and efficiently manage those elements. 
The critical sections of the code demonstrate proper document handling, error management, and systematic cleanup, providing a solid foundation for developers exploring DOM manipulation within C. \ No newline at end of file +The `elements_by_class_name` example illustrates how to use the lexbor library +to parse HTML content, search for elements by class name, and efficiently manage +those elements. The critical sections of the code demonstrate proper document +handling, error management, and systematic cleanup, providing a solid foundation +for developers exploring DOM manipulation within C. \ No newline at end of file diff --git a/source/examples/html/elements_by_tag_name.md b/source/examples/html/elements_by_tag_name.md index 77c0485..e7c9be5 100644 --- a/source/examples/html/elements_by_tag_name.md +++ b/source/examples/html/elements_by_tag_name.md @@ -1,19 +1,28 @@ # HTML Elements by Tag Name Example -This article will explain the code found in the source file [lexbor/html/elements_by_tag_name.c](https://github.com/lexbor/lexbor/blob/master/examples/lexbor/html/elements_by_tag_name.c), which demonstrates how to find and print HTML elements by their tag names using the Lexbor DOM library. +This article will explain the code found in the source file +[lexbor/html/elements_by_tag_name.c](https://github.com/lexbor/lexbor/blob/master/examples/lexbor/html/elements_by_tag_name.c), +which demonstrates how to find and print HTML elements by their tag names using +the Lexbor DOM library. ## Code Overview -The purpose of this example is to parse a simple HTML string and retrieve all `
<div>` elements from the parsed document. It achieves this by leveraging the Lexbor library's DOM capabilities to manage and manipulate the HTML document structure. +The purpose of this example is to parse a simple HTML string and retrieve all +`<div>
` elements from the parsed document. It achieves this by leveraging the +Lexbor library's DOM capabilities to manage and manipulate the HTML document +structure. ## Main Function -The entry point of the program is the `main` function, which begins by declaring several variables essential for the parsing process: +The entry point of the program is the `main` function, which begins by declaring +several variables essential for the parsing process: - `status` stores the success or failure status of various operations. - `element` will point to the current HTML element being processed. -- `document` links to the HTML document that will be created from the parsed input. -- `collection` is intended to hold the collection of elements found in the document. +- `document` links to the HTML document that will be created from the parsed + input. +- `collection` is intended to hold the collection of elements found in the + document. ### Parsing HTML @@ -23,13 +32,15 @@ The HTML string defined as: const lxb_char_t html[] = "
"; ``` -represents a simple HTML fragment which contains two `
<div>` elements and a `<span>` element. The size of the HTML string is determined next: +represents a simple HTML fragment which contains two `<div>
` elements and a +`` element. The size of the HTML string is determined next: ```c size_t html_size = sizeof(html) - 1; ``` -This allows the program to recognize the length of the string without including the null terminator. +This allows the program to recognize the length of the string without including +the null terminator. The `parse` function is then called to create a `document` from the HTML string: @@ -37,7 +48,8 @@ The `parse` function is then called to create a `document` from the HTML string: document = parse(html, html_size); ``` -This function interprets the HTML and constructs a corresponding DOM structure. The parsing outcome is crucial; it will dictate the next steps in the program. +This function interprets the HTML and constructs a corresponding DOM structure. +The parsing outcome is crucial; it will dictate the next steps in the program. ### Creating a DOM Collection @@ -47,7 +59,9 @@ A collection is created to hold the resulting nodes: collection = lxb_dom_collection_make(&document->dom_document, 128); ``` -This function attempts to allocate memory for a collection that can store up to 128 DOM elements. If memory allocation fails, the program exits with an error message: +This function attempts to allocate memory for a collection that can store up to +128 DOM elements. If memory allocation fails, the program exits with an error +message: ```c if (collection == NULL) { @@ -57,7 +71,8 @@ if (collection == NULL) { ### Retrieving Elements by Tag Name -The critical operation of this example is retrieving `
<div>` elements from the document: +The critical operation of this example is retrieving `<div>
` elements from the +document: ```c status = lxb_dom_elements_by_tag_name(lxb_dom_interface_element(document->body), @@ -67,7 +82,8 @@ status = lxb_dom_elements_by_tag_name(lxb_dom_interface_element(document->body), Here, `lxb_dom_elements_by_tag_name` takes three parameters: 1. The reference to the body of the document. 2. The collection object to store the found elements. -3. The string `"div"` along with its length, specifying which tags to search for. +3. The string `"div"` along with its length, specifying which tags to search + for. If the call is unsuccessful, it again exits with an error message: @@ -79,7 +95,8 @@ if (status != LXB_STATUS_OK) { ### Output the Found Elements -The program then prints the initial HTML string and displays a message indicating that it is about to list the found `
<div>` elements: +The program then prints the initial HTML string and displays a message +indicating that it is about to list the found `<div>
` elements: ```c PRINT("HTML:"); @@ -97,7 +114,8 @@ for (size_t i = 0; i < lxb_dom_collection_length(collection); i++) { } ``` -This loop retrieves each element from the collection by index and uses the `serialize_node` function to output its representation. +This loop retrieves each element from the collection by index and uses the +`serialize_node` function to output its representation. ### Cleanup @@ -108,8 +126,13 @@ lxb_dom_collection_destroy(collection, true); lxb_html_document_destroy(document); ``` -This ensures that there are no memory leaks after the program's execution is complete. +This ensures that there are no memory leaks after the program's execution is +complete. ## Conclusion -This example serves as a practical demonstration of how to use the Lexbor library to parse HTML and find elements by tag name. By using functions from the library's API, the code effectively processes a document and manages collections of elements, showcasing the utility of the Lexbor framework in web development tasks. \ No newline at end of file +This example serves as a practical demonstration of how to use the Lexbor +library to parse HTML and find elements by tag name. By using functions from the +library's API, the code effectively processes a document and manages collections +of elements, showcasing the utility of the Lexbor framework in web development +tasks. \ No newline at end of file diff --git a/source/examples/html/encoding.md b/source/examples/html/encoding.md index 526324a..eb5ea92 100644 --- a/source/examples/html/encoding.md +++ b/source/examples/html/encoding.md @@ -1,41 +1,84 @@ # HTML Encoding Example -This article provides an explanation for the HTML Encoding example found in the file [lexbor/html/encoding.c](https://github.com/lexbor/lexbor/blob/master/examples/lexbor/html/encoding.c). This program is designed to read an HTML file, determine its character encoding, and print it out. 
The implementation utilizes the Lexbor library, which offers various functions to handle encoding. +This article provides an explanation for the HTML Encoding example found in the +file +[lexbor/html/encoding.c](https://github.com/lexbor/lexbor/blob/master/examples/lexbor/html/encoding.c). +This program is designed to read an HTML file, determine its character encoding, +and print it out. The implementation utilizes the Lexbor library, which offers +various functions to handle encoding. ## Overview -The main function of the example handles command-line input, reads an HTML file, and determines its encoding using the Lexbor library. The code includes a failure handling mechanism and a usage function to guide users on how to execute the program properly. +The main function of the example handles command-line input, reads an HTML file, +and determines its encoding using the Lexbor library. The code includes a +failure handling mechanism and a usage function to guide users on how to execute +the program properly. ## Key Code Sections ### Error Handling Macro -The `FAILED` macro is a pivotal part of this code, providing a consistent way to handle errors throughout the program. It takes two parameters: a boolean flag `with_usage` and a variable number of arguments. If an error occurs, it prints the provided error message to the standard error stream and, if requested, displays the usage information before quitting the program. This helps keep the code clean while managing multiple error points effectively. +The `FAILED` macro is a pivotal part of this code, providing a consistent way to +handle errors throughout the program. It takes two parameters: a boolean flag +`with_usage` and a variable number of arguments. If an error occurs, it prints +the provided error message to the standard error stream and, if requested, +displays the usage information before quitting the program. This helps keep the +code clean while managing multiple error points effectively. 
### Command-Line Arguments -In the `main` function, the program checks the number of command-line arguments passed to it. If the argument count does not equal 2, the program calls the `usage` function to provide instructions on how to execute the program correctly and then exits. This ensures that users understand how to use the program before any further processing occurs. +In the `main` function, the program checks the number of command-line arguments +passed to it. If the argument count does not equal 2, the program calls the +`usage` function to provide instructions on how to execute the program correctly +and then exits. This ensures that users understand how to use the program before +any further processing occurs. ### Reading the HTML File -The program reads the HTML file specified in the command-line argument using the `lexbor_fs_file_easy_read` function. It stores the content in a dynamic array and checks for successful reading. If the file cannot be read, it invokes the `FAILED` macro with an appropriate error message, ensuring that the program does not proceed with `NULL` data. +The program reads the HTML file specified in the command-line argument using the +`lexbor_fs_file_easy_read` function. It stores the content in a dynamic array +and checks for successful reading. If the file cannot be read, it invokes the +`FAILED` macro with an appropriate error message, ensuring that the program does +not proceed with `NULL` data. ### Initializing HTML Encoding -The core logic for handling character encoding begins with the initialization of the `lxb_html_encoding_t` struct via the `lxb_html_encoding_init` function. This struct is essential for managing encoding data throughout the program. If initialization fails, the program handles the error gracefully using the `FAILED` macro again. +The core logic for handling character encoding begins with the initialization of +the `lxb_html_encoding_t` struct via the `lxb_html_encoding_init` function. 
This +struct is essential for managing encoding data throughout the program. If +initialization fails, the program handles the error gracefully using the +`FAILED` macro again. ### Determining Encoding -The most crucial part of the program is determining the HTML encoding with the `lxb_html_encoding_determine` function. This function analyzes the passed HTML data to determine its encoding. In the previous comment section, there is a mention of a 1024-byte limit, which reflects a common optimization practice where a program doesn't need to read the entire file if a meta encoding tag is typically found within the first 1024 bytes. However, this section is commented out, meaning the program currently reads the complete content. +The most crucial part of the program is determining the HTML encoding with the +`lxb_html_encoding_determine` function. This function analyzes the passed HTML +data to determine its encoding. In the previous comment section, there is a +mention of a 1024-byte limit, which reflects a common optimization practice +where a program doesn't need to read the entire file if a meta encoding tag is +typically found within the first 1024 bytes. However, this section is commented +out, meaning the program currently reads the complete content. ### Printing the Encoding -Once the encoding is determined, the program retrieves the encoding entry using `lxb_html_encoding_meta_entry`. If a valid entry is found, it prints the encoding name. If no encoding is determined, it simply outputs that the encoding was not found. This provides the user with understandable feedback regarding the HTML file's character encoding. +Once the encoding is determined, the program retrieves the encoding entry using +`lxb_html_encoding_meta_entry`. If a valid entry is found, it prints the +encoding name. If no encoding is determined, it simply outputs that the encoding +was not found. This provides the user with understandable feedback regarding the +HTML file's character encoding. 
### Cleanup -At the end of the program, whether successful or in the case of an error, memory cleanup is performed. The `lexbor_free` function is called to release the allocated memory for the HTML content, and `lxb_html_encoding_destroy` cleans up the encoding struct. This is an important step to prevent memory leaks and ensure proper resource management. +At the end of the program, whether successful or in the case of an error, memory +cleanup is performed. The `lexbor_free` function is called to release the +allocated memory for the HTML content, and `lxb_html_encoding_destroy` cleans up +the encoding struct. This is an important step to prevent memory leaks and +ensure proper resource management. ## Conclusion -The HTML Encoding example demonstrates essential practices such as error handling, memory management, and the use of a library to enhance functionality. By following this example, developers can understand how to utilize the Lexbor library for encoding detection in HTML documents, while also adhering to proper coding standards for readability and maintainability. \ No newline at end of file +The HTML Encoding example demonstrates essential practices such as error +handling, memory management, and the use of a library to enhance functionality. +By following this example, developers can understand how to utilize the Lexbor +library for encoding detection in HTML documents, while also adhering to proper +coding standards for readability and maintainability. \ No newline at end of file diff --git a/source/examples/html/html2sexpr.md b/source/examples/html/html2sexpr.md index 86751a2..0cf040a 100644 --- a/source/examples/html/html2sexpr.md +++ b/source/examples/html/html2sexpr.md @@ -1,16 +1,27 @@ # HTML to S-Expression Converter Example -This article provides an overview of a code example found in the file [lexbor/html/html2sexpr.c](https://github.com/lexbor/lexbor/blob/master/examples/lexbor/html/html2sexpr.c). 
The program is designed to convert an HTML tag tree into an S-expression string and output it to standard output. The program utilizes the Lexbor library to handle parsing and manipulating HTML documents. +This article provides an overview of a code example found in the file +[lexbor/html/html2sexpr.c](https://github.com/lexbor/lexbor/blob/master/examples/lexbor/html/html2sexpr.c). +The program is designed to convert an HTML tag tree into an S-expression string +and output it to standard output. The program utilizes the Lexbor library to +handle parsing and manipulating HTML documents. ## Overview -The program first checks if the correct number of command-line arguments is provided. It expects one argument: the path to an HTML file. It reads the contents of this file and initializes an HTML document object using Lexbor's API. After parsing the HTML, the program invokes a tree-walking function to serialize the HTML structure into an S-expression format. The serialized output is then printed to the console. +The program first checks if the correct number of command-line arguments is +provided. It expects one argument: the path to an HTML file. It reads the +contents of this file and initializes an HTML document object using Lexbor's +API. After parsing the HTML, the program invokes a tree-walking function to +serialize the HTML structure into an S-expression format. The serialized output +is then printed to the console. ## Major Code Sections ### Argument Handling and File Reading -The `main` function begins with argument validation. It ensures that exactly one argument is received; otherwise, it calls the `usage` function, which prints the program's usage instructions to standard error. +The `main` function begins with argument validation. It ensures that exactly one +argument is received; otherwise, it calls the `usage` function, which prints the +program's usage instructions to standard error. 
```c if (argc != 2) { @@ -19,7 +30,8 @@ if (argc != 2) { } ``` -Upon validation, the program proceeds to read the HTML file using the `lexbor_fs_file_easy_read` function, which simplifies file reading: +Upon validation, the program proceeds to read the HTML file using the +`lexbor_fs_file_easy_read` function, which simplifies file reading: ```c html = lexbor_fs_file_easy_read((const lxb_char_t *) argv[1], &html_len); @@ -29,7 +41,9 @@ If file reading fails, it reports an error and releases relevant resources. ### HTML Document Initialization and Parsing -Next, the code creates an HTML document object with `lxb_html_document_create`. If this allocation fails, it destroys any previously allocated document and frees the memory associated with the HTML content: +Next, the code creates an HTML document object with `lxb_html_document_create`. +If this allocation fails, it destroys any previously allocated document and +frees the memory associated with the HTML content: ```c document = lxb_html_document_create(); @@ -41,13 +55,18 @@ After successfully creating the document, the program parses the HTML content: status = lxb_html_document_parse(document, html, html_len); ``` -This step processes the HTML string and builds a structured representation of the document. +This step processes the HTML string and builds a structured representation of +the document. ### Traversing the DOM and Serializing to S-Expression -The `tree_walker` function is the core of the serialization process. It traverses the DOM tree recursively, converting each node into an S-expression format. +The `tree_walker` function is the core of the serialization process. It +traverses the DOM tree recursively, converting each node into an S-expression +format. -It begins by checking the type of each node. For elements, it calls the serialization callback `cb` to append the opening parenthesis, the node's name, and any attributes: +It begins by checking the type of each node.
For elements, it calls the +serialization callback `cb` to append the opening parenthesis, the node's name, +and any attributes: ```c if (node->type == LXB_DOM_NODE_TYPE_ELEMENT) { @@ -57,11 +76,16 @@ if (node->type == LXB_DOM_NODE_TYPE_ELEMENT) { status = attributes(node, cb, ctx); ``` -The `attributes` function iterates through each node's attributes and formats them as `(attribute_name 'attribute_value)` pairs, again using the callback to transmit this information. +The `attributes` function iterates through each node's attributes and formats +them as `(attribute_name 'attribute_value)` pairs, again using the callback to +transmit this information. ### Handling Template Nodes -The `tree_walker` function includes logic to handle nodes of type `LXB_TAG_TEMPLATE`. If a node is a template and contains child nodes, it recursively calls `tree_walker` on them, ensuring that the contents of the template are also serialized: +The `tree_walker` function includes logic to handle nodes of type +`LXB_TAG_TEMPLATE`. If a node is a template and contains child nodes, it +recursively calls `tree_walker` on them, ensuring that the contents of the +template are also serialized: ```c if (node->local_name == LXB_TAG_TEMPLATE) { @@ -74,7 +98,9 @@ if (node->local_name == LXB_TAG_TEMPLATE) { ### Cleanup and Exit Status -After serialization is complete, the `main` function cleans up by destroying the document and freeing allocated memory. The program concludes by returning an appropriate exit status based on whether the operations succeeded or failed: +After serialization is complete, the `main` function cleans up by destroying the +document and freeing allocated memory. 
The program concludes by returning an +appropriate exit status based on whether the operations succeeded or failed: ```c lxb_html_document_destroy(document); @@ -82,8 +108,14 @@ lexbor_free(html); return EXIT_SUCCESS; ``` -In the case of failure at any point, the program proceeds to the `failed` label, ensuring resources are released before terminating. +In the case of failure at any point, the program proceeds to the `failed` label, +ensuring resources are released before terminating. ## Conclusion -This example demonstrates a straightforward implementation of converting an HTML document structure into S-expressions using the Lexbor library. The program is structured to handle input validation, document parsing, tree traversal, and serialization efficiently while providing clear feedback in the case of errors. It showcases the use of Lexbor's DOM manipulation capabilities and highlights how to build a recursive tree-walking algorithm for tree serialization. \ No newline at end of file +This example demonstrates a straightforward implementation of converting an HTML +document structure into S-expressions using the Lexbor library. The program is +structured to handle input validation, document parsing, tree traversal, and +serialization efficiently while providing clear feedback in the case of errors. +It showcases the use of Lexbor's DOM manipulation capabilities and highlights +how to build a recursive tree-walking algorithm for tree serialization. \ No newline at end of file diff --git a/source/examples/html/index.md b/source/examples/html/index.md index a7ae38d..658eacf 100644 --- a/source/examples/html/index.md +++ b/source/examples/html/index.md @@ -1,6 +1,7 @@ # HTML Examples -These articles delve into some of the examples in more detail to help you use `lexbor` more effectively. +These articles delve into some of the examples in more detail to help you use +`lexbor` more effectively. 
```{toctree} :maxdepth: 1 diff --git a/source/examples/html/parse.md b/source/examples/html/parse.md index 813b49e..d88f0a0 100644 --- a/source/examples/html/parse.md +++ b/source/examples/html/parse.md @@ -1,29 +1,61 @@ # HTML Parsing and Serialization Example -This example demonstrates how to create an HTML parser using the lexbor library, parse simple HTML strings into document objects, and serialize those documents back to a readable format. The code is found in the source file [lexbor/html/parse.c](https://github.com/lexbor/lexbor/blob/master/examples/lexbor/html/parse.c). +This example demonstrates how to create an HTML parser using the lexbor library, +parse simple HTML strings into document objects, and serialize those documents +back to a readable format. The code is found in the source file +[lexbor/html/parse.c](https://github.com/lexbor/lexbor/blob/master/examples/lexbor/html/parse.c). ## Code Overview -The program begins by including the necessary header files and defining the main function, which is the entry point for execution. It declares several variables that will be needed throughout the parsing process, including the status of the parser, pointers to HTML document objects, and the HTML strings to be parsed. +The program begins by including the necessary header files and defining the main +function, which is the entry point for execution. It declares several variables +that will be needed throughout the parsing process, including the status of the +parser, pointers to HTML document objects, and the HTML strings to be parsed. ## Initialization -First, the HTML parser is created with `lxb_html_parser_create()`, which allocates memory for the parser. It is essential to check that the parser was created successfully. The program initializes the parser with `lxb_html_parser_init(parser)`, and again checks for successful initialization. If there is a failure at either point, a failure message is printed, and the process is terminated. 
This aspect of the code ensures that the parser is correctly set up before proceeding further. +First, the HTML parser is created with `lxb_html_parser_create()`, which +allocates memory for the parser. It is essential to check that the parser was +created successfully. The program initializes the parser with +`lxb_html_parser_init(parser)`, and again checks for successful initialization. +If there is a failure at either point, a failure message is printed, and the +process is terminated. This aspect of the code ensures that the parser is +correctly set up before proceeding further. ## Parsing HTML -Next, the program prepares two simple HTML snippets for parsing: `html_one` and `html_two`. These strings represent basic HTML structures containing a `div` with a `p` element. The lengths of these strings are calculated to facilitate parsing. +Next, the program prepares two simple HTML snippets for parsing: `html_one` and +`html_two`. These strings represent basic HTML structures containing a `div` +with a `p` element. The lengths of these strings are calculated to facilitate +parsing. -The parsing occurs with `lxb_html_parse(parser, html_one, html_one_len)`, which attempts to parse the first HTML string and store the resulting document object in `doc_one`. A similar approach is taken for `doc_two`. In both cases, it is crucial to verify that the parsing was successful—if either document object is `NULL`, the program reports a failure. +The parsing occurs with `lxb_html_parse(parser, html_one, html_one_len)`, which +attempts to parse the first HTML string and store the resulting document object +in `doc_one`. A similar approach is taken for `doc_two`. In both cases, it is +crucial to verify that the parsing was successful—if either document object is +`NULL`, the program reports a failure. ## Serialization -Once both documents are successfully created, the program proceeds to serialize them. The method `lxb_html_serialize_pretty_tree_cb()` is called for each document. 
This function is responsible for converting the document object back into a structured HTML format, with an option for pretty printing. The first argument converts the document into a DOM node interface, while the remaining arguments provide options for serialization. Again, the program checks the status to ensure serialization succeeded. +Once both documents are successfully created, the program proceeds to serialize +them. The method `lxb_html_serialize_pretty_tree_cb()` is called for each +document. This function is responsible for converting the document object back +into a structured HTML format, with an option for pretty printing. The first +argument converts the document into a DOM node interface, while the remaining +arguments provide options for serialization. Again, the program checks the +status to ensure serialization succeeded. ## Cleanup -After serialization, it is important to clean up resources. The program destroys the parser and the HTML document objects with `lxb_html_parser_destroy()` and `lxb_html_document_destroy()`, respectively. This step prevents memory leaks and ensures that all allocated resources are properly released. +After serialization, it is important to clean up resources. The program destroys +the parser and the HTML document objects with `lxb_html_parser_destroy()` and +`lxb_html_document_destroy()`, respectively. This step prevents memory leaks and +ensures that all allocated resources are properly released. ## Conclusion -This example is a clear demonstration of the workflow when utilizing the lexbor library for HTML parsing and serialization. By handling initialization, parsing, serialization, and cleanup, the program effectively showcases how to work with HTML documents in a structured manner. The checks for status at each stage ensure robustness, making it easier to identify issues during development. 
\ No newline at end of file +This example is a clear demonstration of the workflow when utilizing the lexbor +library for HTML parsing and serialization. By handling initialization, parsing, +serialization, and cleanup, the program effectively showcases how to work with +HTML documents in a structured manner. The checks for status at each stage +ensure robustness, making it easier to identify issues during development. \ No newline at end of file diff --git a/source/examples/html/parse_chunk.md b/source/examples/html/parse_chunk.md index 39a22c0..cd848cc 100644 --- a/source/examples/html/parse_chunk.md +++ b/source/examples/html/parse_chunk.md @@ -1,10 +1,18 @@ # HTML Chunk Parsing Example -This article provides an overview of the HTML chunk parsing example implemented in the source file [lexbor/html/parse_chunk.c](https://github.com/lexbor/lexbor/blob/master/examples/lexbor/html/parse_chunk.c). The example demonstrates how to utilize the Lexbor HTML parsing library to handle HTML data in incremental chunks. By breaking the input into smaller pieces, it showcases the parser's versatility and ability to manage partial data streams effectively. +This article provides an overview of the HTML chunk parsing example implemented +in the source file +[lexbor/html/parse_chunk.c](https://github.com/lexbor/lexbor/blob/master/examples/lexbor/html/parse_chunk.c). +The example demonstrates how to utilize the Lexbor HTML parsing library to +handle HTML data in incremental chunks. By breaking the input into smaller +pieces, it showcases the parser's versatility and ability to manage partial data +streams effectively. ## Code Overview -The main function serves as the entry point for the program. Here, several significant components of the Lexbor library are employed, such as creating a parser, managing HTML documents, and serializing the parsed content. +The main function serves as the entry point for the program. 
Here, several +significant components of the Lexbor library are employed, such as creating a +parser, managing HTML documents, and serializing the parsed content. ### Initialization @@ -15,19 +23,25 @@ parser = lxb_html_parser_create(); status = lxb_html_parser_init(parser); ``` -In this section, `lxb_html_parser_create()` is called to create a new HTML parser instance. It's crucial to check if the parser was successfully created by examining `status`. If initialization fails, a failure message is displayed. +In this section, `lxb_html_parser_create()` is called to create a new HTML +parser instance. It's crucial to check if the parser was successfully created by +examining `status`. If initialization fails, a failure message is displayed. ### Parsing Chunks -After initialization, the code prepares to parse the HTML content chunk by chunk: +After initialization, the code prepares to parse the HTML content chunk by +chunk: ```c document = lxb_html_parse_chunk_begin(parser); ``` -This line initializes parsing by creating a document object that will hold the parsed data. If the document object is not successfully created, an error message is emitted, halting further execution. +This line initializes parsing by creating a document object that will hold the +parsed data. If the document object is not successfully created, an error +message is emitted, halting further execution. -The program then enters a loop to process the defined HTML chunks stored in a static array: +The program then enters a loop to process the defined HTML chunks stored in a +static array: ```c for (size_t i = 0; html[i][0] != '\0'; i++) { @@ -39,7 +53,10 @@ for (size_t i = 0; html[i][0] != '\0'; i++) { } ``` -Here, `lxb_html_parse_chunk_process()` is called for each chunk of HTML until the end of the array is reached. The function takes two parameters: the parser instance and the length of each HTML chunk. If parsing any chunk fails, it reports the error via the `FAILED` macro. 
+Here, `lxb_html_parse_chunk_process()` is called for each chunk of HTML until +the end of the array is reached. The function takes three parameters: the +parser instance, a pointer to the chunk data, and the chunk's length. If +parsing any chunk fails, it reports the error via the `FAILED` macro. ### Finishing the Parsing @@ -49,7 +66,8 @@ After processing all the chunks, the parsing is concluded with: status = lxb_html_parse_chunk_end(parser); ``` -This function finalizes the parsing operation. Like the other stages, it checks if the operation succeeded, and handles any errors accordingly. +This function finalizes the parsing operation. Like the other stages, it checks +if the operation succeeded, and handles any errors accordingly. ### Serialization @@ -61,7 +79,10 @@ status = lxb_html_serialize_pretty_tree_cb(lxb_dom_interface_node(document), 0, serializer_callback, NULL); ``` -This line serializes the parsed HTML tree into a human-readable format. The `lxb_dom_interface_node(document)` retrieves the root node of the parsed document for serialization. The use of the callback function allows for customization in how the output is processed. +This line serializes the parsed HTML tree into a human-readable format. The +`lxb_dom_interface_node(document)` retrieves the root node of the parsed +document for serialization. The use of the callback function allows for +customization in how the output is processed. ### Cleanup @@ -72,8 +93,16 @@ lxb_html_document_destroy(document); lxb_html_parser_destroy(parser); ``` -These calls ensure that the allocated parser and document objects are properly destroyed, freeing resources that are no longer needed. +These calls ensure that the allocated parser and document objects are properly +destroyed, freeing resources that are no longer needed.
## Conclusion -The example provided in [lexbor/html/parse_chunk.c](https://github.com/lexbor/lexbor/blob/master/examples/lexbor/html/parse_chunk.c) is a straightforward illustration of how to parse HTML data incrementally with the Lexbor library. By breaking the input into manageable chunks, the parser can efficiently handle larger HTML documents and offers developers flexibility when processing dynamic or streamed data. This method is particularly useful in web environments where HTML content may not always be available as a single, complete document. \ No newline at end of file +The example provided in +[lexbor/html/parse_chunk.c](https://github.com/lexbor/lexbor/blob/master/examples/lexbor/html/parse_chunk.c) +is a straightforward illustration of how to parse HTML data incrementally with +the Lexbor library. By breaking the input into manageable chunks, the parser can +efficiently handle larger HTML documents and offers developers flexibility when +processing dynamic or streamed data. This method is particularly useful in web +environments where HTML content may not always be available as a single, +complete document. \ No newline at end of file diff --git a/source/examples/html/tokenizer/callback.md b/source/examples/html/tokenizer/callback.md index 1e7cf93..12393fc 100644 --- a/source/examples/html/tokenizer/callback.md +++ b/source/examples/html/tokenizer/callback.md @@ -1,40 +1,72 @@ # HTML Tokenizer Callback Example -This article describes the implementation of an HTML Tokenizer Callback found in the [lexbor/html/tokenizer/callback.c](https://github.com/lexbor/lexbor/blob/master/examples/lexbor/html/tokenizer/callback.c) source file. The purpose of this code is to demonstrate how to parse an HTML string and handle tokens as they are generated. It establishes a callback mechanism that is invoked after each token is processed, allowing for custom processing or logging of token data. 
+This article describes the implementation of an HTML Tokenizer Callback found in +the +[lexbor/html/tokenizer/callback.c](https://github.com/lexbor/lexbor/blob/master/examples/lexbor/html/tokenizer/callback.c) +source file. The purpose of this code is to demonstrate how to parse an HTML +string and handle tokens as they are generated. It establishes a callback +mechanism that is invoked after each token is processed, allowing for custom +processing or logging of token data. ## Overview -The code begins by including necessary headers and defining a macro to handle error reporting. It then implements a token callback function, `token_callback`, which retrieves the tag name from a token, determines if the token represents a closing tag, and prints relevant details. The main function orchestrates the creation, initialization, and execution of the tokenizer. +The code begins by including necessary headers and defining a macro to handle +error reporting. It then implements a token callback function, `token_callback`, +which retrieves the tag name from a token, determines if the token represents a +closing tag, and prints relevant details. The main function orchestrates the +creation, initialization, and execution of the tokenizer. ## Error Handling Macro -The code defines a macro, `FAILED`, which simplifies error reporting and exits the program when an error occurs. This macro takes a format string and variadic arguments, prints the error message to standard error, and terminates the program with `EXIT_FAILURE`. This approach centralizes error handling and ensures that the program stops execution on critical failures. +The code defines a macro, `FAILED`, which simplifies error reporting and exits +the program when an error occurs. This macro takes a format string and variadic +arguments, prints the error message to standard error, and terminates the +program with `EXIT_FAILURE`. 
This approach centralizes error handling and +ensures that the program stops execution on critical failures. ## Token Callback Function -The function `token_callback` is critical as it processes each token generated by the tokenizer. It accepts three parameters: a pointer to the tokenizer, a pointer to the current token, and a context pointer (which is unused in this case). +The function `token_callback` is critical as it processes each token generated +by the tokenizer. It accepts three parameters: a pointer to the tokenizer, a +pointer to the current token, and a context pointer (which is unused in this +case). -Within `token_callback`, the tag name is obtained using `lxb_tag_name_by_id`. If the tag name cannot be retrieved, the macro `FAILED` is invoked to log the error and exit. The token's type is checked to see if it indicates a closing tag. The results, including the tag name, ID, and whether it is a closing tag, are printed to standard output. +Within `token_callback`, the tag name is obtained using `lxb_tag_name_by_id`. If +the tag name cannot be retrieved, the macro `FAILED` is invoked to log the error +and exit. The token's type is checked to see if it indicates a closing tag. The +results, including the tag name, ID, and whether it is a closing tag, are +printed to standard output. ## Main Function Execution Flow The `main` function contains several key operations: -1. **Creating and Initializing the Tokenizer**: - The tokenizer is created using `lxb_html_tokenizer_create` and initialized with `lxb_html_tokenizer_init`. If any of these operations fail, the `FAILED` macro is invoked. +1. **Creating and Initializing the Tokenizer**: The tokenizer is created using + `lxb_html_tokenizer_create` and initialized with `lxb_html_tokenizer_init`. + If any of these operations fail, the `FAILED` macro is invoked. -2. 
**Setting the Token Callback**: - The tokenizer's callback function is set using `lxb_html_tokenizer_callback_token_done_set`, linking the tokenizer to the `token_callback` function defined earlier. +2. **Setting the Token Callback**: The tokenizer's callback function is set + using `lxb_html_tokenizer_callback_token_done_set`, linking the tokenizer to + the `token_callback` function defined earlier. -3. **Beginning the Tokenization Process**: - The tokenization process is initiated with `lxb_html_tokenizer_begin`. This prepares the tokenizer for consuming HTML data. +3. **Beginning the Tokenization Process**: The tokenization process is initiated + with `lxb_html_tokenizer_begin`. This prepares the tokenizer for consuming + HTML data. -4. **Processing HTML Data**: - The provided HTML string (`"
test
"`) is processed by calling `lxb_html_tokenizer_chunk`, which reads a chunk of HTML to tokenize. After processing, the tokenizer is signaled to end its operation with `lxb_html_tokenizer_end`. +4. **Processing HTML Data**: The provided HTML string + (`"
test
"`) is processed by calling + `lxb_html_tokenizer_chunk`, which reads a chunk of HTML to tokenize. After + processing, the tokenizer is signaled to end its operation with + `lxb_html_tokenizer_end`. -5. **Cleanup**: - Finally, the tokenizer is destroyed using `lxb_html_tokenizer_destroy`, freeing up any resources allocated during its operation. +5. **Cleanup**: Finally, the tokenizer is destroyed using + `lxb_html_tokenizer_destroy`, freeing up any resources allocated during its + operation. ## Summary -This example illustrates the use of a callback function within a tokenizer to process HTML tokens sequentially. By gracefully handling errors and providing hooks for further processing, the code affords flexibility and clarity in parsing HTML inputs using the lexbor library. It exemplifies best practices in resource management, modular function design, and effective error handling in C. \ No newline at end of file +This example illustrates the use of a callback function within a tokenizer to +process HTML tokens sequentially. By gracefully handling errors and providing +hooks for further processing, the code affords flexibility and clarity in +parsing HTML inputs using the lexbor library. It exemplifies best practices in +resource management, modular function design, and effective error handling in C. \ No newline at end of file diff --git a/source/examples/html/tokenizer/simple.md b/source/examples/html/tokenizer/simple.md index 773a4b6..de96c1f 100644 --- a/source/examples/html/tokenizer/simple.md +++ b/source/examples/html/tokenizer/simple.md @@ -1,37 +1,69 @@ # HTML Tokenizer Example -This article provides a detailed explanation of an HTML tokenizer example implemented in C, demonstrating the capabilities of the lexbor library through the file [lexbor/html/tokenizer/simple.c](https://github.com/lexbor/lexbor/blob/master/examples/lexbor/html/tokenizer/simple.c). This code is intended to parse a simple HTML string and display the tokens generated by the tokenizer. 
+This article provides a detailed explanation of an HTML tokenizer example +implemented in C, demonstrating the capabilities of the lexbor library through +the file +[lexbor/html/tokenizer/simple.c](https://github.com/lexbor/lexbor/blob/master/examples/lexbor/html/tokenizer/simple.c). +This code is intended to parse a simple HTML string and display the tokens +generated by the tokenizer. ## Code Overview -The main function of this code, `main`, initializes the tokenizer, sets a callback for token processing, and then processes a predefined HTML string. The tokenizer handles the parsing by breaking the HTML into tokens, which are processed by the `token_callback` function. +The main function of this code, `main`, initializes the tokenizer, sets a +callback for token processing, and then processes a predefined HTML string. The +tokenizer handles the parsing by breaking the HTML into tokens, which are +processed by the `token_callback` function. ### Tokenization Process -1. **Initialization**: The tokenizer is created and initialized with `lxb_html_tokenizer_create()` and `lxb_html_tokenizer_init()`. If initialization fails, an error message is printed using the `FAILED` macro, which handles error reporting and exits the program. +1. **Initialization**: The tokenizer is created and initialized with + `lxb_html_tokenizer_create()` and `lxb_html_tokenizer_init()`. If + initialization fails, an error message is printed using the `FAILED` macro, + which handles error reporting and exits the program. -2. **Token Callback**: The `token_callback` function is registered as a callback to handle tokens generated by the tokenizer. This function processes different types of tokens: - - **End of File Token**: If the token indicates the end of the input, the function simply returns it. - - **Text Token**: If it is a text token, the function prints the text content enclosed by the appropriate markers. - - **HTML Tags**: For opening and closing tags, the function prints the tag name. 
If there are attributes, it prints them along with their values. The handling of attribute values takes care of different quoting styles (e.g., single or double quotes). +2. **Token Callback**: The `token_callback` function is registered as a callback + to handle tokens generated by the tokenizer. This function processes + different types of tokens: + - **End of File Token**: If the token indicates the end of the input, the + function simply returns it. + - **Text Token**: If it is a text token, the function prints the text content + enclosed by the appropriate markers. + - **HTML Tags**: For opening and closing tags, the function prints the tag + name. If there are attributes, it prints them along with their values. The + handling of attribute values takes care of different quoting styles (e.g., + single or double quotes). ### Main Function Execution -The main logic begins with defining an HTML string using an array of characters. It prints the original HTML for clarity: +The main logic begins with defining an HTML string using an array of characters. +It prints the original HTML for clarity: ```c const lxb_char_t data[] = "
<div a='b' enabled> &copy; Hi<span c=\"d\" e=f>" " my </span>friend</div>
"; ``` -3. **Begin Tokenization**: After setting up the tokenizer and establishing the callback function, the tokenization process begins with `lxb_html_tokenizer_begin(tkz)`, which prepares the tokenizer to accept input. +3. **Begin Tokenization**: After setting up the tokenizer and establishing the + callback function, the tokenization process begins with + `lxb_html_tokenizer_begin(tkz)`, which prepares the tokenizer to accept + input. -4. **Input Chunk Processing**: The example HTML data is passed to the tokenizer using `lxb_html_tokenizer_chunk(tkz, data, (sizeof(data) - 1))`. The tokenizer processes this chunk of data, generating tokens as defined in the callback function. +4. **Input Chunk Processing**: The example HTML data is passed to the tokenizer + using `lxb_html_tokenizer_chunk(tkz, data, (sizeof(data) - 1))`. The + tokenizer processes this chunk of data, generating tokens as defined in the + callback function. -5. **Ending Tokenization**: After processing the input, the tokenizer is finalized with `lxb_html_tokenizer_end(tkz)`, ensuring that all tokens are properly flushed and processed. +5. **Ending Tokenization**: After processing the input, the tokenizer is + finalized with `lxb_html_tokenizer_end(tkz)`, ensuring that all tokens are + properly flushed and processed. -6. **Cleanup**: Finally, the tokenizer resources are released with `lxb_html_tokenizer_destroy(tkz)` to prevent memory leaks. +6. **Cleanup**: Finally, the tokenizer resources are released with + `lxb_html_tokenizer_destroy(tkz)` to prevent memory leaks. ## Conclusion -The provided example illustrates the basic functionality of the lexbor HTML tokenizer. It demonstrates how to set up a tokenizer, process HTML data, and define a callback to handle tokenization events. This example can serve as a foundation for more complex HTML processing tasks using lexbor, which is designed to efficiently handle HTML parsing requirements. 
\ No newline at end of file +The provided example illustrates the basic functionality of the lexbor HTML +tokenizer. It demonstrates how to set up a tokenizer, process HTML data, and +define a callback to handle tokenization events. This example can serve as a +foundation for more complex HTML processing tasks using lexbor, which is +designed to efficiently handle HTML parsing requirements. \ No newline at end of file diff --git a/source/examples/html/tokenizer/tag_attributes.md b/source/examples/html/tokenizer/tag_attributes.md index d5703c6..99e802f 100644 --- a/source/examples/html/tokenizer/tag_attributes.md +++ b/source/examples/html/tokenizer/tag_attributes.md @@ -1,10 +1,18 @@ # Tokenization and Attribute Extraction Example -This article explains the code found in the `tag_attributes.c` file of the lexbor project, which focuses on the tokenization of HTML content and the extraction of attributes from tokens. The primary purpose of this code is to parse a small fragment of HTML and output the attributes associated with each token. +This article explains the code found in the `tag_attributes.c` file of the +lexbor project, which focuses on the tokenization of HTML content and the +extraction of attributes from tokens. The primary purpose of this code is to +parse a small fragment of HTML and output the attributes associated with each +token. ## Overview -The `tag_attributes.c` file implements a simple HTML tokenizer. It initializes a tokenizer instance, feeds it some HTML data, and uses a callback function to process and display the attributes of parsed tokens. The tokenizer effectively handles different HTML tags and their attributes while logging any potential errors that may occur during the process. +The `tag_attributes.c` file implements a simple HTML tokenizer. It initializes a +tokenizer instance, feeds it some HTML data, and uses a callback function to +process and display the attributes of parsed tokens. 
The tokenizer effectively +handles different HTML tags and their attributes while logging any potential +errors that may occur during the process. ## Code Breakdown @@ -17,9 +25,12 @@ The file begins with including necessary header files: #include "lexbor/html/token_attr.h" ``` -These headers provide definitions and functionalities related to HTML tokenization and attribute handling. +These headers provide definitions and functionalities related to HTML +tokenization and attribute handling. -The `FAILED` macro is defined to streamline error handling throughout the code. This macro takes a format string and variable arguments, prints the error message to standard error, and exits the program if an issue arises. +The `FAILED` macro is defined to streamline error handling throughout the code. +This macro takes a format string and variable arguments, prints the error +message to standard error, and exits the program if an issue arises. ### Token Callback Function @@ -30,7 +41,8 @@ static lxb_html_token_t * token_callback(lxb_html_tokenizer_t *tkz, lxb_html_token_t *token, void *ctx) ``` -This function is called whenever a token is completed. It first checks if the token is a text node or has no attributes: +This function is called whenever a token is completed. It first checks if the +token is a text node or has no attributes: ```c if (token->tag_id == LXB_TAG__TEXT || attr == NULL) { @@ -38,7 +50,10 @@ if (token->tag_id == LXB_TAG__TEXT || attr == NULL) { } ``` -If the token is a text node or has no attributes, the function returns immediately without further processing. Otherwise, it retrieves the name of the tag associated with the token using `lxb_tag_name_by_id`. A failure at this point will invoke the `FAILED` macro: +If the token is a text node or has no attributes, the function returns +immediately without further processing. Otherwise, it retrieves the name of the +tag associated with the token using `lxb_tag_name_by_id`. 
A failure at this +point will invoke the `FAILED` macro: ```c tag = lxb_tag_name_by_id(token->tag_id, NULL); @@ -47,14 +62,19 @@ if (tag == NULL) { } ``` -Assuming the tag name retrieval is successful, it prints out the tag's attributes. The `while` loop iterates through the list of attributes associated with the token: +Assuming the tag name retrieval is successful, it prints out the tag's +attributes. The `while` loop iterates through the list of attributes associated +with the token: ```c while (attr != NULL) { name = lxb_html_token_attr_name(attr, NULL); ``` -For each attribute found, it checks if the name is valid; if not, it acknowledges the situation by noting that the name is not set, particularly handling tokens like `DOCTYPE`. The associated values of the attributes are likewise printed if they exist. +For each attribute found, it checks if the name is valid; if not, it +acknowledges the situation by noting that the name is not set, particularly +handling tokens like `DOCTYPE`. The associated values of the attributes are +likewise printed if they exist. ### Main Function @@ -64,7 +84,9 @@ The `main` function orchestrates the entire process: int main(int argc, const char *argv[]) ``` -This function initializes the tokenizer and sets up the HTML string for parsing. The HTML fragment being parsed includes a `div` tag with several attributes and nested `option` tags. It first prints the HTML string to the console: +This function initializes the tokenizer and sets up the HTML string for parsing. +The HTML fragment being parsed includes a `div` tag with several attributes and +nested `option` tags. It first prints the HTML string to the console: ```c const lxb_char_t data[] = "
" @@ -79,12 +101,21 @@ tkz = lxb_html_tokenizer_create(); status = lxb_html_tokenizer_init(tkz); ``` -In case of an error during the tokenizer's creation or initialization, it utilizes the `FAILED` macro to handle the error appropriately. +In case of an error during the tokenizer's creation or initialization, it +utilizes the `FAILED` macro to handle the error appropriately. -The callback function for token completion is set, and the tokenizer begins processing the HTML data. It processes the input by calling `lxb_html_tokenizer_chunk`, and if any issues arise during these stages, the `FAILED` macro is utilized once more to identify failures in parsing. +The callback function for token completion is set, and the tokenizer begins +processing the HTML data. It processes the input by calling +`lxb_html_tokenizer_chunk`, and if any issues arise during these stages, the +`FAILED` macro is utilized once more to identify failures in parsing. -Finally, the tokenizer is destroyed, freeing any resources it allocated during its execution, and the program returns 0, indicating a successful run. +Finally, the tokenizer is destroyed, freeing any resources it allocated during +its execution, and the program returns 0, indicating a successful run. ## Conclusion -This example illustrates the process of HTML tokenization using the lexbor library. By implementing a callback to handle parsed tokens, the code effectively extracts and displays attribute names and values from the given HTML fragment. It showcases the ability to manage errors gracefully while providing informative output for attribute processing within tokens. \ No newline at end of file +This example illustrates the process of HTML tokenization using the lexbor +library. By implementing a callback to handle parsed tokens, the code +effectively extracts and displays attribute names and values from the given HTML +fragment. 
It showcases the ability to manage errors gracefully while providing +informative output for attribute processing within tokens. \ No newline at end of file diff --git a/source/examples/html/tokenizer/text.md b/source/examples/html/tokenizer/text.md index dc94dae..6498210 100644 --- a/source/examples/html/tokenizer/text.md +++ b/source/examples/html/tokenizer/text.md @@ -1,16 +1,26 @@ # HTML Tokenizer Example -This article describes the functionality of the example code provided in the file [lexbor/html/tokenizer/text.c](https://github.com/lexbor/lexbor/blob/master/examples/lexbor/html/tokenizer/text.c). The code implements an HTML tokenizer using the Lexbor library, focusing on extracting and printing text tokens from HTML input. +This article describes the functionality of the example code provided in the +file +[lexbor/html/tokenizer/text.c](https://github.com/lexbor/lexbor/blob/master/examples/lexbor/html/tokenizer/text.c). +The code implements an HTML tokenizer using the Lexbor library, focusing on +extracting and printing text tokens from HTML input. ## Overview of the Code -The main thrust of this code is to parse HTML data, identify text tokens within it, and print those tokens to the standard output. The code utilizes functions provided by the Lexbor library, a lightweight and efficient HTML and XML processing library. +The main thrust of this code is to parse HTML data, identify text tokens within +it, and print those tokens to the standard output. The code utilizes functions +provided by the Lexbor library, a lightweight and efficient HTML and XML +processing library. ## Key Sections of the Code ### Header and Macros -The code begins with the inclusion of the `lexbor/html/tokenizer.h` header file, which contains the necessary declarations for using the tokenizer functionality of the Lexbor library. Following this, a macro named `FAILED` is defined. 
This macro can be used throughout the code to simplify error handling: +The code begins with the inclusion of the `lexbor/html/tokenizer.h` header file, +which contains the necessary declarations for using the tokenizer functionality +of the Lexbor library. Following this, a macro named `FAILED` is defined. This +macro can be used throughout the code to simplify error handling: ```c #define FAILED(...) \ @@ -22,11 +32,13 @@ The code begins with the inclusion of the `lexbor/html/tokenizer.h` header file, while (0) ``` -It takes a format string and arguments to generate error messages. When invoked, it prints the message to standard error and terminates the program. +It takes a format string and arguments to generate error messages. When invoked, +it prints the message to standard error and terminates the program. ### Token Callback Function -Next, there is the `token_callback` function that manages the processing of tokens emitted by the tokenizer: +Next, there is the `token_callback` function that manages the processing of +tokens emitted by the tokenizer: ```c static lxb_html_token_t * @@ -44,7 +56,12 @@ token_callback(lxb_html_tokenizer_t *tkz, lxb_html_token_t *token, void *ctx) } ``` -The function checks whether the token is a text token (identified by `LXB_TAG__TEXT`). If it is not, it simply returns the token without further processing. For text tokens, it prints the text content to standard output using the `printf` function. This content is extracted from the token's `text_start` and `text_end` fields, which indicate the starting and ending positions of the text within the HTML data. +The function checks whether the token is a text token (identified by +`LXB_TAG__TEXT`). If it is not, it simply returns the token without further +processing. For text tokens, it prints the text content to standard output using +the `printf` function. 
This content is extracted from the token's `text_start` +and `text_end` fields, which indicate the starting and ending positions of the +text within the HTML data. ### Main Function @@ -80,12 +97,26 @@ int main(int argc, const char *argv[]) } ``` -The HTML input is defined as a character array that includes HTML elements and character references. The code creates a tokenizer instance using `lxb_html_tokenizer_create()` and initializes it with `lxb_html_tokenizer_init()`. If these operations fail, the `FAILED` macro is called to report the issue and exit. +The HTML input is defined as a character array that includes HTML elements and +character references. The code creates a tokenizer instance using +`lxb_html_tokenizer_create()` and initializes it with +`lxb_html_tokenizer_init()`. If these operations fail, the `FAILED` macro is +called to report the issue and exit. -The tokenizer callback is set through `lxb_html_tokenizer_callback_token_done_set()`, linking the `token_callback` function to handle tokens once they are fully parsed. The main parsing operations occur through `lxb_html_tokenizer_begin()` and `lxb_html_tokenizer_chunk()`, processing the data until the end of the input with `lxb_html_tokenizer_end()`. +The tokenizer callback is set through +`lxb_html_tokenizer_callback_token_done_set()`, linking the `token_callback` +function to handle tokens once they are fully parsed. The main parsing +operations occur through `lxb_html_tokenizer_begin()` and +`lxb_html_tokenizer_chunk()`, processing the data until the end of the input +with `lxb_html_tokenizer_end()`. -Finally, the tokenizer instance is destroyed with `lxb_html_tokenizer_destroy(tkz)`, which frees up resources allocated during the process. +Finally, the tokenizer instance is destroyed with +`lxb_html_tokenizer_destroy(tkz)`, which frees up resources allocated during the +process. 
## Conclusion -This example provides a clear illustration of how to utilize the Lexbor library to parse HTML and process text tokens. By focusing on text tokens, and employing proper error handling mechanics, the code demonstrates a concise yet effective approach to basic HTML tokenization. \ No newline at end of file +This example provides a clear illustration of how to utilize the Lexbor library +to parse HTML and process text tokens. By focusing on text tokens, and employing +proper error handling mechanics, the code demonstrates a concise yet effective +approach to basic HTML tokenization. \ No newline at end of file diff --git a/source/examples/index.md b/source/examples/index.md index caaba33..bb8b8fe 100644 --- a/source/examples/index.md +++ b/source/examples/index.md @@ -1,6 +1,7 @@ # Examples -These articles delve into some of the examples in more detail to help you use `lexbor` more effectively. +These articles delve into some of the examples in more detail to help you use +`lexbor` more effectively. ```{toctree} :maxdepth: 2 diff --git a/source/examples/punycode/decode.md b/source/examples/punycode/decode.md index 2d182f3..c606294 100644 --- a/source/examples/punycode/decode.md +++ b/source/examples/punycode/decode.md @@ -1,14 +1,23 @@ # Punycode Decoding Example -This article explains the implementation of a Punycode decoding utility found in the [lexbor/punycode/decode.c](https://github.com/lexbor/lexbor/blob/master/examples/lexbor/punycode/decode.c) file. The code example facilitates the decoding of encoded domain names into their regular representation, which is critical for handling internationalized domain names (IDNs). +This article explains the implementation of a Punycode decoding utility found in +the +[lexbor/punycode/decode.c](https://github.com/lexbor/lexbor/blob/master/examples/lexbor/punycode/decode.c) +file. 
The code example facilitates the decoding of encoded domain names into +their regular representation, which is critical for handling internationalized +domain names (IDNs). ## Overview -The core function of this program reads input from standard input, decodes it using the Lexbor library's Punycode functionality, and outputs the decoded string to standard output. Below, we detail the main components of the code, their functionality, and the logic behind the operations. +The core function of this program reads input from standard input, decodes it +using the Lexbor library's Punycode functionality, and outputs the decoded +string to standard output. Below, we detail the main components of the code, +their functionality, and the logic behind the operations. ## Main Function -The `main` function serves as the entry point of the program. It sets up the necessary variables and handles the reading, reallocating, and decoding of data. +The `main` function serves as the entry point of the program. It sets up the +necessary variables and handles the reading, reallocating, and decoding of data. ### Variable Declarations @@ -22,7 +31,9 @@ The program begins by declaring several important variables: ### Memory Allocation -Memory is allocated for `buf` using `lexbor_malloc`, which allocates space equal to the size of `inbuf`. If memory allocation fails, the program outputs an error message and exits with `EXIT_FAILURE`. +Memory is allocated for `buf` using `lexbor_malloc`, which allocates space equal +to the size of `inbuf`. If memory allocation fails, the program outputs an error +message and exits with `EXIT_FAILURE`. ### Reading Input @@ -32,11 +43,14 @@ The program enters a `do-while` loop to read from standard input: size = fread(inbuf, 1, sizeof(inbuf), stdin); ``` -If the read operation does not return the full buffer size, it checks if the end of the file (EOF) is reached or if an error occurred. In either case, the program handles these conditions appropriately. 
+If the read operation does not return the full buffer size, it checks if the end +of the file (EOF) is reached or if an error occurred. In either case, the +program handles these conditions appropriately. ### Buffer Management -Before storing more data into `buf`, the program checks if there is enough space: +Before storing more data into `buf`, the program checks if there is enough +space: ```c if (p + size > end) { @@ -46,36 +60,49 @@ if (p + size > end) { } ``` -If there isn't sufficient space, it reallocates memory to increase the buffer size by threefold. If this operation fails, an error message is displayed and the program jumps to the `failed` label to free allocated memory and exit. +If there isn't sufficient space, it reallocates memory to increase the buffer +size by threefold. If this operation fails, an error message is displayed and +the program jumps to the `failed` label to free allocated memory and exit. ### Input Cleaning -After reading input, the program checks and trims any trailing newline (`\n`) or carriage return (`\r`) characters for proper formatting before decoding begins. +After reading input, the program checks and trims any trailing newline (`\n`) or +carriage return (`\r`) characters for proper formatting before decoding begins. ### Decoding Process -The actual decoding is performed by the `lxb_punycode_decode` function, which takes the prepared buffer and calls a callback function: +The actual decoding is performed by the `lxb_punycode_decode` function, which +takes the prepared buffer and calls a callback function: ```c status = lxb_punycode_decode(buf, p - buf, callback, NULL); ``` -This function executes the decoding, and if it fails, an error message is printed, and cleanup is performed. +This function executes the decoding, and if it fails, an error message is +printed, and cleanup is performed. 
### Output and Cleanup -Once decoding is successful, the program prints a newline for formatting and then frees the allocated memory before exiting successfully. +Once decoding is successful, the program prints a newline for formatting and +then frees the allocated memory before exiting successfully. ## Callback Function -The `callback` function is defined to handle the output of each decoded segment. It receives the decoded data and its length, printing it to standard output: +The `callback` function is defined to handle the output of each decoded segment. +It receives the decoded data and its length, printing it to standard output: ```c printf("%.*s", (int) len, (const char *) data); ``` -This function is simple yet crucial, as it formats and handles how the decoded data is displayed. +This function is simple yet crucial, as it formats and handles how the decoded +data is displayed. ## Conclusion -This example demonstrates how to utilize the Lexbor library for Punycode decoding in C. The program handles memory management, input reading, and decoding efficiently while ensuring robustness against common issues like memory allocation failures. Through this utility, developers can work with internationalized domain names effectively, translating them into human-readable forms. \ No newline at end of file +This example demonstrates how to utilize the Lexbor library for Punycode +decoding in C. The program handles memory management, input reading, and +decoding efficiently while ensuring robustness against common issues like memory +allocation failures. Through this utility, developers can work with +internationalized domain names effectively, translating them into human-readable +forms. 
\ No newline at end of file diff --git a/source/examples/punycode/encode.md b/source/examples/punycode/encode.md index 89fd2cc..ce86dc6 100644 --- a/source/examples/punycode/encode.md +++ b/source/examples/punycode/encode.md @@ -1,14 +1,26 @@ # Punycode Encoding Example -This article discusses the code example found in the file [lexbor/punycode/encode.c](https://github.com/lexbor/lexbor/blob/master/examples/lexbor/punycode/encode.c), which demonstrates how to encode a string using the Punycode algorithm with the lexbor library. Punycode is a way to represent Internationalized Domain Names (IDNs) using only ASCII characters. This code facilitates reading input data, manages memory allocation dynamically, and encodes the input using a callback function to handle the output. +This article discusses the code example found in the file +[lexbor/punycode/encode.c](https://github.com/lexbor/lexbor/blob/master/examples/lexbor/punycode/encode.c), +which demonstrates how to encode a string using the Punycode algorithm with the +lexbor library. Punycode is a way to represent Internationalized Domain Names +(IDNs) using only ASCII characters. This code facilitates reading input data, +manages memory allocation dynamically, and encodes the input using a callback +function to handle the output. ## Code Explanation -The main function plays a central role in this example. It starts by defining several variables for handling the buffer, input data, and status codes. An important portion of the code is responsible for memory management, particularly the allocation and potential reallocation of memory needed to store the input. +The main function plays a central role in this example. It starts by defining +several variables for handling the buffer, input data, and status codes. An +important portion of the code is responsible for memory management, particularly +the allocation and potential reallocation of memory needed to store the input. 
### Memory Allocation -The first crucial step involves allocating memory for the buffer, which will hold the input data. The `lexbor_malloc` function is called to allocate memory equivalent to the size of `inbuf`. If the allocation fails, an error message is printed, and the program exits with `EXIT_FAILURE`. +The first crucial step involves allocating memory for the buffer, which will +hold the input data. The `lexbor_malloc` function is called to allocate memory +equivalent to the size of `inbuf`. If the allocation fails, an error message is +printed, and the program exits with `EXIT_FAILURE`. ```c buf = lexbor_malloc(sizeof(inbuf)); @@ -20,7 +32,10 @@ if (buf == NULL) { ### Reading Input -The program uses a loop to read input from standard input using `fread`. It attempts to read up to `sizeof(inbuf)` bytes into `inbuf`. After reading, it checks if the end of the file is reached and appropriately modifies the loop control variable. +The program uses a loop to read input from standard input using `fread`. It +attempts to read up to `sizeof(inbuf)` bytes into `inbuf`. After reading, it +checks if the end of the file is reached and appropriately modifies the loop +control variable. ```c size = fread(inbuf, 1, sizeof(inbuf), stdin); @@ -36,7 +51,10 @@ if (size != sizeof(inbuf)) { ### Handling Buffer Overflow -Another significant section of the code checks whether the size of the input exceeds the buffer's capacity. If it does, it reallocates memory for the buffer using `lexbor_realloc`, aiming to increase its size by a multiple of three. This is a proactive approach to accommodating larger inputs. +Another significant section of the code checks whether the size of the input +exceeds the buffer's capacity. If it does, it reallocates memory for the buffer +using `lexbor_realloc`, aiming to increase its size by a multiple of three. This +is a proactive approach to accommodating larger inputs. 
```c if (p + size > end) { @@ -56,7 +74,10 @@ if (p + size > end) { ### Encoding Input -Once the input is collected and appropriately buffered, the code trims any trailing newline or carriage return characters. It then calls the `lxb_punycode_encode` function, passing the buffer and the length of the data, as well as a callback function to handle the encoded output. +Once the input is collected and appropriately buffered, the code trims any +trailing newline or carriage return characters. It then calls the +`lxb_punycode_encode` function, passing the buffer and the length of the data, +as well as a callback function to handle the encoded output. ```c status = lxb_punycode_encode(buf, p - buf, callback, NULL); @@ -66,7 +87,9 @@ if (status != LXB_STATUS_OK) { } ``` -The callback function `callback` is defined later in the file. It simply prints the encoded data back to standard output, handling any Unicode to ASCII conversions that may be necessary. +The callback function `callback` is defined later in the file. It simply prints +the encoded data back to standard output, handling any Unicode to ASCII +conversions that may be necessary. ```c static lxb_status_t @@ -80,7 +103,9 @@ callback(const lxb_char_t *data, size_t len, void *ctx, bool unchanged) ### Cleanup and Error Handling -Throughout the code, error handling is emphasized. If any memory operation fails, the program exits gracefully by freeing any allocated memory before termination. This ensures that the application does not lead to memory leaks. +Throughout the code, error handling is emphasized. If any memory operation +fails, the program exits gracefully by freeing any allocated memory before +termination. This ensures that the application does not lead to memory leaks. 
```c failed: @@ -90,4 +115,11 @@ failed: ## Conclusion -This article provides a comprehensive overview of the [lexbor/punycode/encode.c](https://github.com/lexbor/lexbor/blob/master/examples/lexbor/punycode/encode.c) example, illustrating how to implement Punycode encoding in C. The example highlights important practices such as dynamic memory management, error handling, and the use of callback functions, which are all vital when dealing with input and output in systems programming. By following this structured approach, developers can efficiently utilize the lexbor library to handle Internationalized Domain Names. \ No newline at end of file +This article provides a comprehensive overview of the +[lexbor/punycode/encode.c](https://github.com/lexbor/lexbor/blob/master/examples/lexbor/punycode/encode.c) +example, illustrating how to implement Punycode encoding in C. The example +highlights important practices such as dynamic memory management, error +handling, and the use of callback functions, which are all vital when dealing +with input and output in systems programming. By following this structured +approach, developers can efficiently utilize the lexbor library to handle +Internationalized Domain Names. \ No newline at end of file diff --git a/source/examples/punycode/index.md b/source/examples/punycode/index.md index 9103ac2..6768317 100644 --- a/source/examples/punycode/index.md +++ b/source/examples/punycode/index.md @@ -1,6 +1,7 @@ # Punycode Examples -These articles delve into some of the examples in more detail to help you use `lexbor` more effectively. +These articles delve into some of the examples in more detail to help you use +`lexbor` more effectively. 
```{toctree} :maxdepth: 1 diff --git a/source/examples/selectors/easy_way.md b/source/examples/selectors/easy_way.md index 1fc5592..f8f17d8 100644 --- a/source/examples/selectors/easy_way.md +++ b/source/examples/selectors/easy_way.md @@ -1,37 +1,73 @@ # CSS Selectors Usage Example -This article explains an example program found in the file [lexbor/selectors/easy_way.c](https://github.com/lexbor/lexbor/blob/master/examples/lexbor/selectors/easy_way.c), which demonstrates how to use the Lexbor library to parse HTML and match it against CSS selectors. The example involves creating an HTML document, defining CSS selectors, and then finding matching nodes in the document. +This article explains an example program found in the file +[lexbor/selectors/easy_way.c](https://github.com/lexbor/lexbor/blob/master/examples/lexbor/selectors/easy_way.c), +which demonstrates how to use the Lexbor library to parse HTML and match it +against CSS selectors. The example involves creating an HTML document, defining +CSS selectors, and then finding matching nodes in the document. ## Overview of the Code -The program begins with the inclusion of necessary headers from the Lexbor library, specifically for handling HTML documents and CSS selectors. The primary functionalities are encapsulated in multiple functions, including the `callback` function, which prints matched nodes, and the `find_callback` function, which keeps track of the count of found nodes. +The program begins with the inclusion of necessary headers from the Lexbor +library, specifically for handling HTML documents and CSS selectors. The primary +functionalities are encapsulated in multiple functions, including the `callback` +function, which prints matched nodes, and the `find_callback` function, which +keeps track of the count of found nodes. ### Function Definitions -- **callback**: This function acts as a callback for serializing HTML nodes. 
It takes a pointer to data representing the node's content and its length, printing the content to the standard output. +- **callback**: This function acts as a callback for serializing HTML nodes. It + takes a pointer to data representing the node's content and its length, + printing the content to the standard output. -- **find_callback**: This callback function is invoked for each matching node found by the CSS selectors. It increments the count of matched nodes, prints the count, and calls the serialization callback to output the node's content. +- **find_callback**: This callback function is invoked for each matching node + found by the CSS selectors. It increments the count of matched nodes, prints + the count, and calls the serialization callback to output the node's content. ### Main Function Breakdown -1. **Initialization**: The `main` function begins by declaring variables for counting matches, managing the status of various operations, and holding references to the document, selectors, parser, and selector list. +1. **Initialization**: The `main` function begins by declaring variables for + counting matches, managing the status of various operations, and holding + references to the document, selectors, parser, and selector list. -2. **HTML and CSS Data**: The example defines a string of HTML containing a `div` with two `p` elements and a string of CSS selectors to match. Specifically, the selectors include a class selector (`.x`) and a compound selector that checks for a `p` element with an `id` of 'y'. +2. **HTML and CSS Data**: The example defines a string of HTML containing a + `div` with two `p` elements and a string of CSS selectors to match. + Specifically, the selectors include a class selector (`.x`) and a compound + selector that checks for a `p` element with an `id` of 'y'. -3. **Creating an HTML Document**: An HTML document object is created and initialized with the HTML string. 
The document must be parsed successfully; otherwise, the program exits with a failure status. +3. **Creating an HTML Document**: An HTML document object is created and + initialized with the HTML string. The document must be parsed successfully; + otherwise, the program exits with a failure status. -4. **CSS Parser Setup**: A CSS parser object is created and initialized, which is necessary for processing the selector strings. +4. **CSS Parser Setup**: A CSS parser object is created and initialized, which + is necessary for processing the selector strings. -5. **Selectors Creation**: A selectors object is initialized to handle the parsing of the CSS selectors. This involves calling `lxb_selectors_create` and then initializing it with `lxb_selectors_init`. +5. **Selectors Creation**: A selectors object is initialized to handle the + parsing of the CSS selectors. This involves calling `lxb_selectors_create` + and then initializing it with `lxb_selectors_init`. -6. **Parsing Selectors**: The CSS selectors string is parsed, and a list of selectors is generated using `lxb_css_selectors_parse`. The status is checked to ensure that parsing was successful. +6. **Parsing Selectors**: The CSS selectors string is parsed, and a list of + selectors is generated using `lxb_css_selectors_parse`. The status is checked + to ensure that parsing was successful. -7. **Serialization of Selectors**: The program prints out the serialized selectors using `lxb_css_selector_serialize_list_chain`, which utilizes the previously defined `callback` function to output each selector. +7. **Serialization of Selectors**: The program prints out the serialized + selectors using `lxb_css_selector_serialize_list_chain`, which utilizes the + previously defined `callback` function to output each selector. -8. **Finding Matching Nodes**: The program identifies the body of the HTML document and utilizes the `lxb_selectors_find` function to locate nodes that match the defined selectors. 
The `find_callback` function processes each matching node. +8. **Finding Matching Nodes**: The program identifies the body of the HTML + document and utilizes the `lxb_selectors_find` function to locate nodes that + match the defined selectors. The `find_callback` function processes each + matching node. -9. **Memory Management**: After processing, the program properly deallocates memory used for selectors, the CSS parser, and the HTML document to prevent memory leaks. +9. **Memory Management**: After processing, the program properly deallocates + memory used for selectors, the CSS parser, and the HTML document to prevent + memory leaks. ### Conclusion -This example demonstrates the effective use of the Lexbor library for manipulating and selecting elements within HTML documents based on CSS selectors. By understanding how to parse both HTML and CSS, and by using callback functions to manage matched nodes, developers can efficiently implement feature-rich web applications. The careful structure of the code ensures maintainability and readability, adhering to best practices in C programming. \ No newline at end of file +This example demonstrates the effective use of the Lexbor library for +manipulating and selecting elements within HTML documents based on CSS +selectors. By understanding how to parse both HTML and CSS, and by using +callback functions to manage matched nodes, developers can efficiently implement +feature-rich web applications. The careful structure of the code ensures +maintainability and readability, adhering to best practices in C programming. \ No newline at end of file diff --git a/source/examples/selectors/index.md b/source/examples/selectors/index.md index a358b60..90dc6c7 100644 --- a/source/examples/selectors/index.md +++ b/source/examples/selectors/index.md @@ -1,6 +1,7 @@ # Selectors Examples -These articles delve into some of the examples in more detail to help you use `lexbor` more effectively. 
+These articles delve into some of the examples in more detail to help you use +`lexbor` more effectively. ```{toctree} :maxdepth: 1 diff --git a/source/examples/selectors/normal_way.md b/source/examples/selectors/normal_way.md index 6d07d7d..96454e5 100644 --- a/source/examples/selectors/normal_way.md +++ b/source/examples/selectors/normal_way.md @@ -1,20 +1,37 @@ # CSS Selectors Parsing and Node Finding Example -This example, found in the source file [lexbor/selectors/normal_way.c](https://github.com/lexbor/lexbor/blob/master/examples/lexbor/selectors/normal_way.c), demonstrates how to use the Lexbor library to parse CSS selectors and find HTML nodes that match those selectors. The code provides a comprehensive workflow, from creating an HTML document to parsing selectors and retrieving matching nodes while handling memory management efficiently. +This example, found in the source file +[lexbor/selectors/normal_way.c](https://github.com/lexbor/lexbor/blob/master/examples/lexbor/selectors/normal_way.c), +demonstrates how to use the Lexbor library to parse CSS selectors and find HTML +nodes that match those selectors. The code provides a comprehensive workflow, +from creating an HTML document to parsing selectors and retrieving matching +nodes while handling memory management efficiently. ## Overview of Key Components -The main function serves as the central processing unit of the code, orchestrating the various tasks. It initializes necessary structures, parses an HTML string, sets up CSS selectors, and employs the Lexbor library's capabilities to find nodes in the document. +The main function serves as the central processing unit of the code, +orchestrating the various tasks. It initializes necessary structures, parses an +HTML string, sets up CSS selectors, and employs the Lexbor library's +capabilities to find nodes in the document. ### HTML and CSS Data -The example uses the HTML string `"<div><p class='x z'> </p><p id='y'>abc</p></div>"`, which contains two `<p>` elements, one with class attributes `x` and `z`, and another with the ID `y`. This HTML will be parsed to create a document object. +The example uses the HTML string `"<div><p class='x z'> </p><p id='y'>abc</p></div>"`, which contains two `<p>` elements, one with class +attributes `x` and `z`, and another with the ID `y`. This HTML will be parsed to +create a document object. -Two CSS selector strings are defined: `".x, div:has(p[id=Y i])"` and `"p:blank"`. These selectors aim to demonstrate the capabilities of the library to handle various matching criteria. +Two CSS selector strings are defined: `".x, div:has(p[id=Y i])"` and +`"p:blank"`. These selectors aim to demonstrate the capabilities of the library +to handle various matching criteria. ### Document Creation and Parsing -The code begins by creating an HTML document using the function `lxb_html_document_create()`. It then parses the HTML content with `lxb_html_document_parse()`. If parsing fails (indicated by a non-OK status), the function exits, ensuring that subsequent operations are performed on a valid document. +The code begins by creating an HTML document using the function +`lxb_html_document_create()`. It then parses the HTML content with +`lxb_html_document_parse()`. If parsing fails (indicated by a non-OK status), +the function exits, ensuring that subsequent operations are performed on a valid +document. ```c document = lxb_html_document_create(); @@ -23,7 +40,10 @@ status = lxb_html_document_parse(document, html, sizeof(html) / sizeof(lxb_char_ ### Memory Management -Proper memory management is crucial in C programming. The code allocates memory for parsed structures using `lxb_css_memory_create()`, initializing it with a specified size. This guarantees that the structures can be populated without running into memory issues. +Proper memory management is crucial in C programming. The code allocates memory +for parsed structures using `lxb_css_memory_create()`, initializing it with a +specified size. This guarantees that the structures can be populated without +running into memory issues.
```c memory = lxb_css_memory_create(); @@ -32,9 +52,14 @@ status = lxb_css_memory_init(memory, 128); ### CSS Parser and Selector Setup -A CSS parser is created with `lxb_css_parser_create()`, and its settings are adjusted to work with the previously created memory. The CSS selectors are set up with `lxb_css_selectors_create()` and initialized, ensuring that they can efficiently handle subsequent parsing requests. +A CSS parser is created with `lxb_css_parser_create()`, and its settings are +adjusted to work with the previously created memory. The CSS selectors are set +up with `lxb_css_selectors_create()` and initialized, ensuring that they can +efficiently handle subsequent parsing requests. -Important to note is the line where the parser is instructed not to create a new selector object for each call, thereby enhancing performance during parsing iterations: +Important to note is the line where the parser is instructed not to create a new +selector object for each call, thereby enhancing performance during parsing +iterations: ```c lxb_css_parser_selectors_set(parser, css_selectors); @@ -42,13 +67,21 @@ lxb_css_parser_selectors_set(parser, css_selectors); ### Selector Parsing and Serialization -The selectors defined earlier are parsed using `lxb_css_selectors_parse()`. The resulting lists (`list_one` and `list_two`) contain the parsed representations of the selectors. If parsing fails, the program exits gracefully. +The selectors defined earlier are parsed using `lxb_css_selectors_parse()`. The +resulting lists (`list_one` and `list_two`) contain the parsed representations +of the selectors. If parsing fails, the program exits gracefully. -After parsing, the example demonstrates HTML serialization through `lxb_html_serialize_pretty_deep_cb()` and outputs the selectors using `lxb_css_selector_serialize_list_chain()`, allowing for a visual check of the parsed structures. 
+After parsing, the example demonstrates HTML serialization through +`lxb_html_serialize_pretty_deep_cb()` and outputs the selectors using +`lxb_css_selector_serialize_list_chain()`, allowing for a visual check of the +parsed structures. ### Finding Nodes by Selectors -The example then proceeds to find HTML nodes using the parsed selectors. It leverages the `lxb_selectors_find()` function, along with a callback function `find_callback`, to process each matching node. This function simply counts the nodes found and prints their representation. +The example then proceeds to find HTML nodes using the parsed selectors. It +leverages the `lxb_selectors_find()` function, along with a callback function +`find_callback`, to process each matching node. This function simply counts the +nodes found and prints their representation. ```c status = lxb_selectors_find(selectors, body, list_one, find_callback, &count); @@ -56,7 +89,9 @@ status = lxb_selectors_find(selectors, body, list_one, find_callback, &count); ### Cleanup and Memory Deallocation -Once all operations are completed, the code carefully deallocates all allocated resources to prevent memory leaks. It uses the appropriate destroy functions for each created object, adhering to good practices in C coding. +Once all operations are completed, the code carefully deallocates all allocated +resources to prevent memory leaks. It uses the appropriate destroy functions for +each created object, adhering to good practices in C coding. ```c (void) lxb_selectors_destroy(selectors, true); @@ -66,4 +101,8 @@ Once all operations are completed, the code carefully deallocates all allocated ## Conclusion -In summary, this example outlines a practical implementation of HTML and CSS handling using the Lexbor library. It emphasizes the importance of robust memory management, selector parsing, and node finding functionalities, making it a valuable reference for developers looking to understand or utilize Lexbor in their projects. 
\ No newline at end of file +In summary, this example outlines a practical implementation of HTML and CSS +handling using the Lexbor library. It emphasizes the importance of robust memory +management, selector parsing, and node finding functionalities, making it a +valuable reference for developers looking to understand or utilize Lexbor in +their projects. \ No newline at end of file diff --git a/source/examples/selectors/unique_nodes.md b/source/examples/selectors/unique_nodes.md index 2165c99..88697d8 100644 --- a/source/examples/selectors/unique_nodes.md +++ b/source/examples/selectors/unique_nodes.md @@ -1,41 +1,85 @@ # CSS Selectors and HTML Node Selection Example -This article discusses the functionality of the `unique_nodes.c` source file, which implements a basic example of parsing HTML and CSS selectors using the lexbor library. The example illustrates how to create an HTML document, parse CSS selectors, and find nodes within the document that match those selectors. +This article discusses the functionality of the `unique_nodes.c` source file, +which implements a basic example of parsing HTML and CSS selectors using the +lexbor library. The example illustrates how to create an HTML document, parse +CSS selectors, and find nodes within the document that match those selectors. ## Key Components ### HTML and CSS Data -At the beginning of the main function, HTML and CSS data are defined. The HTML consists of a `<div>` containing two `<p>` elements, while the CSS contains several selectors, including class selectors, id selectors, and pseudo-class selectors. This data is crucial as it lays the groundwork for the subsequent parsing and node selection processes. +At the beginning of the main function, HTML and CSS data are defined. The HTML +consists of a `<div>` containing two `<p>` elements, while the CSS contains +several selectors, including class selectors, id selectors, and pseudo-class +selectors. This data is crucial as it lays the groundwork for the subsequent +parsing and node selection processes. ### Creating an HTML Document -The code then creates an HTML document using `lxb_html_document_create()` and populates it with the previously defined HTML data. The `lxb_html_document_parse()` function is called to parse the HTML data into a structured format. If parsing fails, the program exits with a failure status. This step transforms the provided HTML string into a DOM (Document Object Model) representation that can be interacted with programmatically. +The code then creates an HTML document using `lxb_html_document_create()` and +populates it with the previously defined HTML data. The +`lxb_html_document_parse()` function is called to parse the HTML data into a +structured format. If parsing fails, the program exits with a failure status. +This step transforms the provided HTML string into a DOM (Document Object Model) +representation that can be interacted with programmatically. ### Creating a CSS Parser -Following the creation of the HTML document, a CSS parser is instantiated with `lxb_css_parser_create()`. This is complemented by an initialization call to `lxb_css_parser_init()`. The parser is necessary for interpreting the CSS selectors provided in the string format. The proper functioning of the parsing depends on successful initialization, and any failure at this stage leads to an exit. +Following the creation of the HTML document, a CSS parser is instantiated with +`lxb_css_parser_create()`. This is complemented by an initialization call to +`lxb_css_parser_init()`. The parser is necessary for interpreting the CSS +selectors provided in the string format. The proper functioning of the parsing +depends on successful initialization, and any failure at this stage leads to an +exit.
### CSS Selector Processing -A CSS selector object is created using `lxb_css_selectors_create()`, and similarly initialized to prepare for parsing operations. It is important to note that the program avoids creating new selector objects each time the parser is called by setting the CSS selectors on the parser with `lxb_css_parser_selectors_set()`. This optimization ensures efficient memory usage and performance. +A CSS selector object is created using `lxb_css_selectors_create()`, and +similarly initialized to prepare for parsing operations. It is important to note +that the program avoids creating new selector objects each time the parser is +called by setting the CSS selectors on the parser with +`lxb_css_parser_selectors_set()`. This optimization ensures efficient memory +usage and performance. ### Parsing the Selectors -The CSS selectors are parsed using `lxb_css_selectors_parse()`, which generates a list of selectors ready for matching with the document's nodes. If parsing fails, the program exits. This list is critical for the next steps, allowing the program to identify nodes that match the defined selectors. +The CSS selectors are parsed using `lxb_css_selectors_parse()`, which generates +a list of selectors ready for matching with the document's nodes. If parsing +fails, the program exits. This list is critical for the next steps, allowing the +program to identify nodes that match the defined selectors. ### Serializing HTML and Selectors -The program outputs the serialized format of the HTML document using `lxb_html_serialize_pretty_deep_cb()`, which calls a callback function to print each node. This is useful for visual verification of the document structure. Similarly, the selectors are serialized with `lxb_css_selector_serialize_list_chain()`, enabling the user to see which selectors have been parsed and are ready for matching. 
+The program outputs the serialized format of the HTML document using +`lxb_html_serialize_pretty_deep_cb()`, which calls a callback function to print +each node. This is useful for visual verification of the document structure. +Similarly, the selectors are serialized with +`lxb_css_selector_serialize_list_chain()`, enabling the user to see which +selectors have been parsed and are ready for matching. ### Finding HTML Nodes -The core functionality of this example is encapsulated in the `lxb_selectors_find()` function, which takes the selectors and attempts to match them against the nodes in the document's body. A callback function, `find_callback`, is provided to handle each found node, incrementing a count and processing each matched node individually. If any part of this process fails, the program suitably returns an error status. +The core functionality of this example is encapsulated in the +`lxb_selectors_find()` function, which takes the selectors and attempts to match +them against the nodes in the document's body. A callback function, +`find_callback`, is provided to handle each found node, incrementing a count and +processing each matched node individually. If any part of this process fails, +the program suitably returns an error status. ### Cleanup -Finally, the program ensures that all allocated resources are correctly disposed of. Various destroy functions are called for the selectors, CSS parser, and the HTML document to prevent memory leaks. This step is essential in any robust application to maintain system performance and reliability. +Finally, the program ensures that all allocated resources are correctly disposed +of. Various destroy functions are called for the selectors, CSS parser, and the +HTML document to prevent memory leaks. This step is essential in any robust +application to maintain system performance and reliability. 
## Conclusion -The `unique_nodes.c` example illustrates a practical application of the lexbor library to handle HTML documents and CSS selectors. By showcasing the entire lifecycle from parsing HTML to finding nodes based on CSS selectors, this example serves as an informative foundation for developers looking to work with document structures and styles in C using the lexbor library. The implemented logic emphasizes efficiency and clarity, ensuring that the handling of selectors and nodes is both effective and straightforward. \ No newline at end of file +The `unique_nodes.c` example illustrates a practical application of the lexbor +library to handle HTML documents and CSS selectors. By showcasing the entire +lifecycle from parsing HTML to finding nodes based on CSS selectors, this +example serves as an informative foundation for developers looking to work with +document structures and styles in C using the lexbor library. The implemented +logic emphasizes efficiency and clarity, ensuring that the handling of selectors +and nodes is both effective and straightforward. \ No newline at end of file diff --git a/source/examples/styles/attribute_style.md b/source/examples/styles/attribute_style.md index 8f04fb9..7fa23db 100644 --- a/source/examples/styles/attribute_style.md +++ b/source/examples/styles/attribute_style.md @@ -1,42 +1,74 @@ # CSS Style Attribute Example -This article provides an in-depth explanation of a code example found in the [lexbor/styles/attribute_style.c](https://github.com/lexbor/lexbor/blob/master/examples/lexbor/styles/attribute_style.c) file. The purpose of this code is to demonstrate how to create an HTML document, parse a specific HTML element, retrieve its CSS style properties, and then serialize those properties for output. +This article provides an in-depth explanation of a code example found in the +[lexbor/styles/attribute_style.c](https://github.com/lexbor/lexbor/blob/master/examples/lexbor/styles/attribute_style.c) +file. 
The purpose of this code is to demonstrate how to create an HTML document, +parse a specific HTML element, retrieve its CSS style properties, and then +serialize those properties for output. ## Code Breakdown ### Header Files and Function Definition -The code begins with necessary includes, specifically `base.h`, along with lexbor's HTML and CSS header files. This setup ensures that all necessary functions related to HTML document handling and CSS processing are available. +The code begins with necessary includes, specifically `base.h`, along with +lexbor's HTML and CSS header files. This setup ensures that all necessary +functions related to HTML document handling and CSS processing are available. -The `callback` function serves as a utility to print CSS property declarations. It takes a character pointer `data`, the length of data `len`, and a context pointer `ctx`. It uses `printf` to output the string, formatting it based on the provided length. This function is fundamental for logging purposes throughout the serialization process. +The `callback` function serves as a utility to print CSS property declarations. +It takes a character pointer `data`, the length of data `len`, and a context +pointer `ctx`. It uses `printf` to output the string, formatting it based on the +provided length. This function is fundamental for logging purposes throughout +the serialization process. ### Main Function The `main` function is where the primary logic occurs: -1. **Document Creation**: - The first step is to create a new HTML document using `lxb_html_document_create()`. If the document fails to create, it reports an error and halts execution using the `FAILED` macro. +1. **Document Creation**: The first step is to create a new HTML document using + `lxb_html_document_create()`. If the document fails to create, it reports an + error and halts execution using the `FAILED` macro. -2. 
**CSS Initialization**: - Following document creation, `lxb_html_document_css_init(doc)` initializes the CSS environment for the document. Again, a failure results in termination. +2. **CSS Initialization**: Following document creation, + `lxb_html_document_css_init(doc)` initializes the CSS environment for the + document. Again, a failure results in termination. -3. **HTML Parsing**: - The code employs `lxb_html_document_parse(doc, html.data, html.length)` to parse a static HTML string that contains a `

<div>` with CSS inline styles. The inline styles include various widths and heights in different units. This parsing step builds the DOM structure of the HTML. +3. **HTML Parsing**: The code employs `lxb_html_document_parse(doc, html.data, + html.length)` to parse a static HTML string that contains a `<div>
` with CSS + inline styles. The inline styles include various widths and heights in + different units. This parsing step builds the DOM structure of the HTML. -4. **Element Retrieval**: - A `lxb_dom_collection_t` is initialized to hold results. The function `lxb_dom_node_by_tag_name()` retrieves elements by their tag name, specifically targeting the `
<div>` tag. If retrieval fails, execution is halted. +4. **Element Retrieval**: A `lxb_dom_collection_t` is initialized to hold + results. The function `lxb_dom_node_by_tag_name()` retrieves elements by + their tag name, specifically targeting the `<div>
` tag. If retrieval fails, + execution is halted. -5. **CSS Property Access**: - The example seeks to extract specific style properties from the `
<div>`. It retrieves the `width` property by name and the `height` property by its corresponding ID using `lxb_html_element_style_by_name` and `lxb_html_element_style_by_id`, respectively. Errors during this stage lead to failure messages. +5. **CSS Property Access**: The example seeks to extract specific style + properties from the `<div>
`. It retrieves the `width` property by name and + the `height` property by its corresponding ID using + `lxb_html_element_style_by_name` and `lxb_html_element_style_by_id`, + respectively. Errors during this stage lead to failure messages. ### Serialization and Output -After acquiring the width and height styles, the example moves to serialize these properties. The `lxb_css_rule_declaration_serialize()` function is called twice, once for each property, passing the `callback` function to handle output. The results are printed to the console, showcasing the values for both properties. +After acquiring the width and height styles, the example moves to serialize +these properties. The `lxb_css_rule_declaration_serialize()` function is called +twice, once for each property, passing the `callback` function to handle output. +The results are printed to the console, showcasing the values for both +properties. ### Cleanup -The `lxb_dom_collection_destroy()` function cleans up the DOM collection used to store the `
<div>` elements, while `lxb_html_document_destroy(doc)` releases the memory allocated for the document. This cleanup ensures no memory leaks occur during program execution. +The `lxb_dom_collection_destroy()` function cleans up the DOM collection used to +store the `<div>
` elements, while `lxb_html_document_destroy(doc)` releases the +memory allocated for the document. This cleanup ensures no memory leaks occur +during program execution. ## Conclusion -This code example illustrates how to manipulate and retrieve CSS properties from an HTML element using the lexbor library. It covers creating an HTML document, parsing content, accessing specific elements, and outputting style properties, providing a comprehensive look at handling HTML and CSS in C with lexbor. The example highlights the importance of proper resource management and error reporting within such operations, which is essential for building robust applications. \ No newline at end of file +This code example illustrates how to manipulate and retrieve CSS properties from +an HTML element using the lexbor library. It covers creating an HTML document, +parsing content, accessing specific elements, and outputting style properties, +providing a comprehensive look at handling HTML and CSS in C with lexbor. The +example highlights the importance of proper resource management and error +reporting within such operations, which is essential for building robust +applications. \ No newline at end of file diff --git a/source/examples/styles/events_insert.md b/source/examples/styles/events_insert.md index 9b18371..c436294 100644 --- a/source/examples/styles/events_insert.md +++ b/source/examples/styles/events_insert.md @@ -1,16 +1,23 @@ # Events Insert Example -This article explains the C code found in [lexbor/styles/events_insert.c](https://github.com/lexbor/lexbor/blob/master/examples/lexbor/styles/events_insert.c), which demonstrates the process of manipulating HTML documents and applying CSS styles using the Lexbor library. The code operates on a simple HTML structure and applies specific styles based on a CSS stylesheet. 
+This article explains the C code found in +[lexbor/styles/events_insert.c](https://github.com/lexbor/lexbor/blob/master/examples/lexbor/styles/events_insert.c), +which demonstrates the process of manipulating HTML documents and applying CSS +styles using the Lexbor library. The code operates on a simple HTML structure +and applies specific styles based on a CSS stylesheet. ## Overview -The provided code initializes an HTML document representation, parses a predefined HTML string, applies a CSS stylesheet, and manipulates the DOM to insert a new HTML element. Here's a breakdown of the major sections of the code. +The provided code initializes an HTML document representation, parses a +predefined HTML string, applies a CSS stylesheet, and manipulates the DOM to +insert a new HTML element. Here's a breakdown of the major sections of the code. ## Code Breakdown ### Includes and Definitions -The code begins with the inclusion of necessary header files from the Lexbor library, which are essential for HTML, CSS, and selector functionalities: +The code begins with the inclusion of necessary header files from the Lexbor +library, which are essential for HTML, CSS, and selector functionalities: ```c #include @@ -18,11 +25,13 @@ The code begins with the inclusion of necessary header files from the Lexbor lib #include ``` -These headers allow access to functions and data structures needed to create and manipulate HTML and CSS documents. +These headers allow access to functions and data structures needed to create and +manipulate HTML and CSS documents. ### Callback Function -A callback function named `callback` is implemented to handle data output when invoked. This function prints data received from serialized output processes: +A callback function named `callback` is implemented to handle data output when +invoked. 
This function prints data received from serialized output processes: ```c lxb_status_t callback(const lxb_char_t *data, size_t len, void *ctx) { @@ -31,18 +40,21 @@ lxb_status_t callback(const lxb_char_t *data, size_t len, void *ctx) { } ``` -Its purpose is to print formatted strings, assisting in visual output of the document processes. +Its purpose is to print formatted strings, assisting in visual output of the +document processes. ### Main Function -The `main` function encapsulates the program logic. It starts by defining various variables and static data for HTML and CSS. +The `main` function encapsulates the program logic. It starts by defining +various variables and static data for HTML and CSS. ```c static const lexbor_str_t html = lexbor_str("
...
"); static const lexbor_str_t slctrs = lexbor_str("div.father {...}"); ``` -Here, `html` contains a `
<div>` with class "father" and some child elements, while `slctrs` defines CSS rules for styling the div and its child paragraphs. +Here, `html` contains a `<div>
` with class "father" and some child elements, +while `slctrs` defines CSS rules for styling the div and its child paragraphs. ### Document Creation and Parsing @@ -58,7 +70,8 @@ The document is then parsed with the defined HTML string: status = lxb_html_document_parse(document, html.data, html.length); ``` -If any operation fails, the program exits to ensure that no subsequent operations are performed on an invalid document structure. +If any operation fails, the program exits to ensure that no subsequent +operations are performed on an invalid document structure. ### CSS Initialization and Parsing @@ -68,46 +81,61 @@ Next, the code initializes the CSS subsystem of the document: status = lxb_html_document_css_init(document); ``` -After this initialization, a CSS parser is created and initialized. The CSS stylesheet is parsed and attached to the HTML document: +After this initialization, a CSS parser is created and initialized. The CSS +stylesheet is parsed and attached to the HTML document: ```c sst = lxb_css_stylesheet_parse(parser, slctrs.data, slctrs.length); status = lxb_html_document_stylesheet_attach(document, sst); ``` -At this stage, all elements in the document receive styles defined in the stylesheet. +At this stage, all elements in the document receive styles defined in the +stylesheet. ### Element Creation and Attribute Setting -The code then seeks to manipulate the DOM by creating a new paragraph element (`

<p>`). This process involves setting attributes that apply styles from the stylesheet: +The code then seeks to manipulate the DOM by creating a new paragraph element +(`<p>

`). This process involves setting attributes that apply styles from the +stylesheet: ```c np = lxb_html_document_create_element(document, p_str.data, p_str.length, NULL); attr = lxb_dom_element_set_attribute(lxb_dom_interface_element(np), class_str.data, class_str.length, best_str.data, best_str.length); ``` -Here, the element is given a class of "best" for styling purposes, followed by another attribute for inline styling. +Here, the element is given a class of "best" for styling purposes, followed by +another attribute for inline styling. ### Inserting the New Element -Once the new element is fully prepared with the appropriate attributes, it is appended to the "father" div: +Once the new element is fully prepared with the appropriate attributes, it is +appended to the "father" div: ```c lxb_html_element_insert_child(div, np); ``` -This action makes it part of the document's tree structure, and consequently, it inherits styling based on CSS rules. +This action makes it part of the document's tree structure, and consequently, it +inherits styling based on CSS rules. ### Final Serialization and Resource Cleanup -The program serializes the new element and produces output that reflects the changes made: +The program serializes the new element and produces output that reflects the +changes made: ```c status = lxb_html_serialize_cb(lxb_dom_interface_node(np), callback, NULL); ``` -Finally, all allocated resources are cleaned up to prevent memory leaks by destroying collections, stylesheets, and the document itself. +Finally, all allocated resources are cleaned up to prevent memory leaks by +destroying collections, stylesheets, and the document itself. ## Conclusion -The code in [lexbor/styles/events_insert.c](https://github.com/lexbor/lexbor/blob/master/examples/lexbor/styles/events_insert.c) illustrates an effective use of the Lexbor library to manipulate HTML and apply CSS. 
By parsing, creating elements, setting attributes, and attaching styles, it provides a clear example of dynamic document editing and processing. This showcases both the capabilities and convenience of the Lexbor framework in handling web technologies programmatically. \ No newline at end of file +The code in +[lexbor/styles/events_insert.c](https://github.com/lexbor/lexbor/blob/master/examples/lexbor/styles/events_insert.c) +illustrates an effective use of the Lexbor library to manipulate HTML and apply +CSS. By parsing, creating elements, setting attributes, and attaching styles, it +provides a clear example of dynamic document editing and processing. This +showcases both the capabilities and convenience of the Lexbor framework in +handling web technologies programmatically. \ No newline at end of file diff --git a/source/examples/styles/index.md b/source/examples/styles/index.md index 282f65f..71310b0 100644 --- a/source/examples/styles/index.md +++ b/source/examples/styles/index.md @@ -1,6 +1,7 @@ # Styles Examples -These articles delve into some of the examples in more detail to help you use `lexbor` more effectively. +These articles delve into some of the examples in more detail to help you use +`lexbor` more effectively. ```{toctree} :maxdepth: 1 diff --git a/source/examples/styles/stylesheet.md b/source/examples/styles/stylesheet.md index 03999fc..d303898 100644 --- a/source/examples/styles/stylesheet.md +++ b/source/examples/styles/stylesheet.md @@ -1,16 +1,28 @@ # CSS Stylesheet Parsing and Application Example -In this article, we will explore the implementation of CSS stylesheet parsing and application to HTML elements using the Lexbor library. The following example is derived from the source file [lexbor/styles/stylesheet.c](https://github.com/lexbor/lexbor/blob/master/examples/lexbor/styles/stylesheet.c). 
The code illustrates how to create an HTML document, parse CSS styles, attach these styles to the HTML document, and finally retrieve and serialize specific style declarations from an element. +In this article, we will explore the implementation of CSS stylesheet parsing +and application to HTML elements using the Lexbor library. The following example +is derived from the source file +[lexbor/styles/stylesheet.c](https://github.com/lexbor/lexbor/blob/master/examples/lexbor/styles/stylesheet.c). +The code illustrates how to create an HTML document, parse CSS styles, attach +these styles to the HTML document, and finally retrieve and serialize specific +style declarations from an element. ## Overview -The core of the example revolves around creating a minimal HTML document that contains a `

<div>` element with inline CSS styles. The code then initializes the Lexbor HTML and CSS parsers, processes the provided CSS, and attaches the styles to the HTML document. Finally, it retrieves specific CSS properties (width and height) from the `<div>
` element and serializes them for output. +The core of the example revolves around creating a minimal HTML document that +contains a `
<div>` element with inline CSS styles. The code then initializes the +Lexbor HTML and CSS parsers, processes the provided CSS, and attaches the styles +to the HTML document. Finally, it retrieves specific CSS properties (width and +height) from the `<div>
` element and serializes them for output. ## Code Breakdown ### Creating the HTML Document -Initially, the program creates an HTML document by calling `lxb_html_document_create()`. If the document creation fails, it triggers a failure message: +Initially, the program creates an HTML document by calling +`lxb_html_document_create()`. If the document creation fails, it triggers a +failure message: ```c doc = lxb_html_document_create(); @@ -19,7 +31,8 @@ if (doc == NULL) { } ``` -This part is crucial as it establishes a context for parsing HTML and applying styles. +This part is crucial as it establishes a context for parsing HTML and applying +styles. ### Initializing the CSS Parser @@ -32,11 +45,13 @@ if (status != LXB_STATUS_OK) { } ``` -Proper initialization allows the program to manage CSS styles associated with the HTML document confidently. +Proper initialization allows the program to manage CSS styles associated with +the HTML document confidently. ### Parsing the CSS Stylesheet -The CSS stylesheet is then created and parsed. The process involves instantiating a CSS parser with: +The CSS stylesheet is then created and parsed. The process involves +instantiating a CSS parser with: ```c parser = lxb_css_parser_create(); @@ -46,7 +61,9 @@ if (status != LXB_STATUS_OK) { } ``` -Once the parser is initialized, the `lxb_css_stylesheet_parse()` function gets called to parse the provided CSS string, which contains styling rules for the `
<div>`: +Once the parser is initialized, the `lxb_css_stylesheet_parse()` function gets +called to parse the provided CSS string, which contains styling rules for the +`<div>
`: ```c sst = lxb_css_stylesheet_parse(parser, css.data, css.length); @@ -55,7 +72,8 @@ if (sst == NULL) { } ``` -Successfully parsing the stylesheet is essential for associating styles with the HTML elements. +Successfully parsing the stylesheet is essential for associating styles with the +HTML elements. ### Parsing the HTML Document @@ -68,7 +86,8 @@ if (status != LXB_STATUS_OK) { } ``` -This transformation processes the HTML string into a structure that can be navigated and manipulated. +This transformation processes the HTML string into a structure that can be +navigated and manipulated. ### Attaching the Stylesheet @@ -85,7 +104,8 @@ This attachment allows the styles to take effect when querying elements. ### Retrieving Element Styles -To get the styles applied to the `
<div>`, the code initializes a collection to store the gathered elements: +To get the styles applied to the `<div>
`, the code initializes a collection to +store the gathered elements: ```c memset(&collection, 0, sizeof(lxb_dom_collection_t)); @@ -97,7 +117,9 @@ if (status != LXB_STATUS_OK) { } ``` -By calling `lxb_dom_node_by_tag_name()`, the program fetches the `
<div>` element, which is then referenced to retrieve style declarations for specific properties: +By calling `lxb_dom_node_by_tag_name()`, the program fetches the `<div>
` +element, which is then referenced to retrieve style declarations for specific +properties: ```c width = lxb_html_element_style_by_name(lxb_html_interface_element(div), @@ -106,11 +128,14 @@ height = lxb_html_element_style_by_id(lxb_html_interface_element(div), LXB_CSS_PROPERTY_HEIGHT); ``` -This logic effectively retrieves both width and height style settings applied to the element. +This logic effectively retrieves both width and height style settings applied to +the element. ### Serializing Styles -To output the retrieved styles, the code serializes each one using the `lxb_css_rule_declaration_serialize()` function, which takes a callback function to handle the output: +To output the retrieved styles, the code serializes each one using the +`lxb_css_rule_declaration_serialize()` function, which takes a callback function +to handle the output: ```c status = lxb_css_rule_declaration_serialize(width, callback, NULL); @@ -121,7 +146,8 @@ Here, the `callback` function simply prints the CSS properties to the console. ### Cleanup -As part of good coding practice, the program ends by freeing allocated resources, ensuring there are no memory leaks: +As part of good coding practice, the program ends by freeing allocated +resources, ensuring there are no memory leaks: ```c (void) lxb_dom_collection_destroy(&collection, false); @@ -132,4 +158,8 @@ As part of good coding practice, the program ends by freeing allocated resources ## Conclusion -The presented example demonstrates the process of parsing and applying CSS styles to an HTML document using the Lexbor library. By following through each part of the code, one can gain insights into how to effectively manage CSS properties within a structured HTML environment, allowing for flexible design and styling in modern web applications. \ No newline at end of file +The presented example demonstrates the process of parsing and applying CSS +styles to an HTML document using the Lexbor library. 
By following through each +part of the code, one can gain insights into how to effectively manage CSS +properties within a structured HTML environment, allowing for flexible design +and styling in modern web applications. \ No newline at end of file diff --git a/source/examples/styles/walk.md b/source/examples/styles/walk.md index cf3f551..32b79b4 100644 --- a/source/examples/styles/walk.md +++ b/source/examples/styles/walk.md @@ -1,52 +1,85 @@ # CSS Style Walking Example -This article explains the functionality and structure of the code found in [lexbor/styles/walk.c](https://github.com/lexbor/lexbor/blob/master/examples/lexbor/styles/walk.c). The example focuses on parsing an HTML document, attaching CSS styles to an element, and traversing the applied styles. The primary goal of this example is to demonstrate how to manipulate the Document Object Model (DOM) and apply CSS styling in the Lexbor library. +This article explains the functionality and structure of the code found in +[lexbor/styles/walk.c](https://github.com/lexbor/lexbor/blob/master/examples/lexbor/styles/walk.c). +The example focuses on parsing an HTML document, attaching CSS styles to an +element, and traversing the applied styles. The primary goal of this example is +to demonstrate how to manipulate the Document Object Model (DOM) and apply CSS +styling in the Lexbor library. ## Overview of the Code -The provided code is organized into several key sections. Each section serves a significant purpose within the program, which includes parsing HTML, creating a CSS parser, and navigating through the styles associated with specific HTML elements. +The provided code is organized into several key sections. Each section serves a +significant purpose within the program, which includes parsing HTML, creating a +CSS parser, and navigating through the styles associated with specific HTML +elements. 
### Include Directives and Function Prototypes -The code begins by including essential header files from the Lexbor library, specifically for HTML and CSS functionalities. It defines two primary callback functions: +The code begins by including essential header files from the Lexbor library, +specifically for HTML and CSS functionalities. It defines two primary callback +functions: 1. **callback**: This function is executed to print serialized CSS data. -2. **walk_cb**: This function is intended to be called for each CSS style declaration when walking through the styles applied to an HTML element. +2. **walk_cb**: This function is intended to be called for each CSS style + declaration when walking through the styles applied to an HTML element. ### Main Functionality -The `main` function encompasses the workflow of the program, starting with the initialization of the HTML document and CSS objects. Here's a detailed breakdown of its sections: +The `main` function encompasses the workflow of the program, starting with the +initialization of the HTML document and CSS objects. Here's a detailed breakdown +of its sections: -1. **Document Creation**: - The code allocates memory for a new HTML document using `lxb_html_document_create()`. If it fails, the program exits with an error. +1. **Document Creation**: The code allocates memory for a new HTML document + using `lxb_html_document_create()`. If it fails, the program exits with an + error. -2. **CSS Initialization**: - The HTML document initiates its CSS functionality through `lxb_html_document_css_init()`. Similar to document creation, any failure leads to program termination. +2. **CSS Initialization**: The HTML document initiates its CSS functionality + through `lxb_html_document_css_init()`. Similar to document creation, any + failure leads to program termination. -3. **HTML Parsing**: - The program parses a static HTML string containing a `
<div>` element using `lxb_html_document_parse()`. Again, error handling ensures that the program only proceeds if parsing is successful. +3. **HTML Parsing**: The program parses a static HTML string containing a + `<div>
` element using `lxb_html_document_parse()`. Again, error handling + ensures that the program only proceeds if parsing is successful. -4. **CSS Parsing**: - A CSS parser is created and initialized. The program then attempts to parse a set of CSS selectors and styles. Successful parsing leads to the association of the stylesheet with the HTML document. +4. **CSS Parsing**: A CSS parser is created and initialized. The program then + attempts to parse a set of CSS selectors and styles. Successful parsing leads + to the association of the stylesheet with the HTML document. -5. **DOM Node Selection**: - The program searches for HTML elements using the CSS class name through `lxb_dom_node_by_class_name()`. If no elements are found or if an error occurs, the program appropriately exits. +5. **DOM Node Selection**: The program searches for HTML elements using the CSS + class name through `lxb_dom_node_by_class_name()`. If no elements are found + or if an error occurs, the program appropriately exits. -6. **Style Walking**: - The function `lxb_html_element_style_walk()` is called to iterate over the styles applied to the `
<div>` element selected earlier. The `walk_cb` function is employed as a callback, allowing printing of style information. +6. **Style Walking**: The function `lxb_html_element_style_walk()` is called to + iterate over the styles applied to the `<div>
` element selected earlier. The + `walk_cb` function is employed as a callback, allowing printing of style + information. ### Walking Through Styles In the `walk_cb` callback function, several actions take place: -- The CSS rule declaration is serialized and printed using `lxb_css_rule_declaration_serialize()`. -- The name and value of each property in the style declaration are serialized and printed through `lxb_css_property_serialize_name()` and `lxb_css_property_serialize()`. This provides complete visibility into the CSS properties applied to the `
<div>`. -- The specificity of each CSS rule, including various parameters that determine the importance and origin of the styles, is printed. +- The CSS rule declaration is serialized and printed using + `lxb_css_rule_declaration_serialize()`. +- The name and value of each property in the style declaration are serialized + and printed through `lxb_css_property_serialize_name()` and + `lxb_css_property_serialize()`. This provides complete visibility into the CSS + properties applied to the `<div>
`. +- The specificity of each CSS rule, including various parameters that determine + the importance and origin of the styles, is printed. ### Resource Cleanup -Finally, the program ensures that all allocated resources are correctly destroyed using respective cleanup functions for DOM collections, stylesheets, parsers, and the HTML document itself. This step is crucial for preventing memory leaks and ensuring efficient resource management. +Finally, the program ensures that all allocated resources are correctly +destroyed using respective cleanup functions for DOM collections, stylesheets, +parsers, and the HTML document itself. This step is crucial for preventing +memory leaks and ensuring efficient resource management. ## Conclusion -This code example highlights the integration of HTML parsing and CSS styling using the Lexbor library. By utilizing the provided functions and callback methods, developers can effectively manipulate and inspect styles associated with HTML elements. The careful arrangement of initialization, parsing, walking through styles, and resource cleanup demonstrates best practices in managing dynamic web content. \ No newline at end of file +This code example highlights the integration of HTML parsing and CSS styling +using the Lexbor library. By utilizing the provided functions and callback +methods, developers can effectively manipulate and inspect styles associated +with HTML elements. The careful arrangement of initialization, parsing, walking +through styles, and resource cleanup demonstrates best practices in managing +dynamic web content. 
\ No newline at end of file diff --git a/source/examples/unicode/idna_to_ascii.md b/source/examples/unicode/idna_to_ascii.md index 62586b0..41c4fac 100644 --- a/source/examples/unicode/idna_to_ascii.md +++ b/source/examples/unicode/idna_to_ascii.md @@ -1,38 +1,54 @@ # IDNA to ASCII Conversion Example -This document provides an explanation of the IDNA to ASCII conversion code example located in the [lexbor/unicode/idna_to_ascii.c](https://github.com/lexbor/lexbor/blob/master/examples/lexbor/unicode/idna_to_ascii.c) source file. The code focuses on converting Internationalized Domain Names (IDN) from their Unicode representations to ASCII, which is often required for compatibility with DNS systems. +This document provides an explanation of the IDNA to ASCII conversion code +example located in the +[lexbor/unicode/idna_to_ascii.c](https://github.com/lexbor/lexbor/blob/master/examples/lexbor/unicode/idna_to_ascii.c) +source file. The code focuses on converting Internationalized Domain Names (IDN) +from their Unicode representations to ASCII, which is often required for +compatibility with DNS systems. ## Overview -The program begins by initializing the necessary libraries and defining the main entry point. Central to the workflow is the utilization of the `lxb_unicode_idna_t` structure, which is responsible for handling the conversion process. The program reads data from standard input and manages memory dynamically to accommodate varying input sizes. +The program begins by initializing the necessary libraries and defining the main +entry point. Central to the workflow is the utilization of the +`lxb_unicode_idna_t` structure, which is responsible for handling the conversion +process. The program reads data from standard input and manages memory +dynamically to accommodate varying input sizes. ## Code Explanation ### Initialization -The program begins with include directives, where it imports the lexbor unicode library. 
The `callback` function is declared, which is used later in the code to process the results of the conversion. +The program begins with include directives, where it imports the lexbor unicode +library. The `callback` function is declared, which is used later in the code to +process the results of the conversion. -In the `main` function, the variables are declared, and critical initialization occurs: +In the `main` function, the variables are declared, and critical initialization +occurs: ```c status = lxb_unicode_idna_init(&idna); ``` -Here, `lxb_unicode_idna_init` initializes an IDNA object, and the program checks for successful initialization, exiting if it fails. +Here, `lxb_unicode_idna_init` initializes an IDNA object, and the program checks +for successful initialization, exiting if it fails. ### Memory Allocation -Memory allocation is handled using the `lexbor_malloc` function. The program allocates a buffer to read input data: +Memory allocation is handled using the `lexbor_malloc` function. The program +allocates a buffer to read input data: ```c buf = lexbor_malloc(sizeof(inbuf)); ``` -If memory allocation fails, the program gracefully handles the error by cleaning up resources and terminating. +If memory allocation fails, the program gracefully handles the error by cleaning +up resources and terminating. ### Input Processing Loop -The main processing loop reads data from standard input using `fread`. It checks for end-of-file conditions and also manages buffer overflows dynamically: +The main processing loop reads data from standard input using `fread`. It checks +for end-of-file conditions and also manages buffer overflows dynamically: ```c if (p + size > end) { @@ -40,11 +56,15 @@ if (p + size > end) { tmp = lexbor_realloc(buf, nsize); ``` -If additional space is needed in the buffer, the program reallocates memory to ensure there is sufficient room for incoming data, multiplying the existing size by three. 
This approach helps accommodate larger inputs without frequent reallocations. +If additional space is needed in the buffer, the program reallocates memory to +ensure there is sufficient room for incoming data, multiplying the existing size +by three. This approach helps accommodate larger inputs without frequent +reallocations. ### Handling Newline Characters -Before proceeding with the IDNA conversion, the program removes trailing newline and carriage return characters from the buffer: +Before proceeding with the IDNA conversion, the program removes trailing newline +and carriage return characters from the buffer: ```c if (p - buf > 0) { @@ -54,17 +74,22 @@ if (p - buf > 0) { } ``` -This ensures that the string sent for conversion does not include unwanted whitespace or end-of-line characters, which could potentially affect the conversion. +This ensures that the string sent for conversion does not include unwanted +whitespace or end-of-line characters, which could potentially affect the +conversion. ### IDNA Conversion -The core functionality of the program lies in the call to `lxb_unicode_idna_to_ascii`, which performs the actual conversion from Unicode to ASCII: +The core functionality of the program lies in the call to +`lxb_unicode_idna_to_ascii`, which performs the actual conversion from Unicode +to ASCII: ```c status = lxb_unicode_idna_to_ascii(&idna, buf, p - buf, callback, NULL, 0); ``` -This function takes the initialized IDNA object, the buffer of data, its length, and a callback function that will handle the output. +This function takes the initialized IDNA object, the buffer of data, its length, +and a callback function that will handle the output. ### Callback Function @@ -79,19 +104,29 @@ callback(const lxb_char_t *data, size_t len, void *ctx) } ``` -This function simply prints the converted ASCII data to the standard output. It receives the data generated by the conversion and its length, allowing it to format the output correctly. 
+This function simply prints the converted ASCII data to the standard output. It +receives the data generated by the conversion and its length, allowing it to +format the output correctly. ### Cleanup and Exit -Finally, the program ensures that all allocated resources are cleaned up correctly: +Finally, the program ensures that all allocated resources are cleaned up +correctly: ```c lexbor_free(buf); lxb_unicode_idna_destroy(&idna, false); ``` -The error handling also follows a similar pattern, ensuring that there are no memory leaks or dangling pointers by freeing up the allocated buffer and destroying the IDNA object. +The error handling also follows a similar pattern, ensuring that there are no +memory leaks or dangling pointers by freeing up the allocated buffer and +destroying the IDNA object. ## Conclusion -This IDNA to ASCII conversion example demonstrates important concepts related to memory management, input handling, and Unicode processing in C using the lexbor library. Through structured control flow and careful resource management, the program efficiently converts IDN input into a format compatible with traditional DNS systems. The use of callback functions helps in handling outputs dynamically, showcasing an effective design pattern in C programming. \ No newline at end of file +This IDNA to ASCII conversion example demonstrates important concepts related to +memory management, input handling, and Unicode processing in C using the lexbor +library. Through structured control flow and careful resource management, the +program efficiently converts IDN input into a format compatible with traditional +DNS systems. The use of callback functions helps in handling outputs +dynamically, showcasing an effective design pattern in C programming. 
\ No newline at end of file diff --git a/source/examples/unicode/index.md b/source/examples/unicode/index.md index 01af28a..c2c0840 100644 --- a/source/examples/unicode/index.md +++ b/source/examples/unicode/index.md @@ -1,6 +1,7 @@ # Unicode Examples -These articles delve into some of the examples in more detail to help you use `lexbor` more effectively. +These articles delve into some of the examples in more detail to help you use +`lexbor` more effectively. ```{toctree} :maxdepth: 1 diff --git a/source/examples/unicode/normalization_form.md b/source/examples/unicode/normalization_form.md index 13c5dd0..b2841ca 100644 --- a/source/examples/unicode/normalization_form.md +++ b/source/examples/unicode/normalization_form.md @@ -1,31 +1,51 @@ # Unicode Normalization Example -This article explains the example code found in the file [lexbor/unicode/normalization_form.c](https://github.com/lexbor/lexbor/blob/master/examples/lexbor/unicode/normalization_form.c). The program demonstrates how to perform Unicode normalization using the Lexbor library, specifically focusing on four normalization forms: NFC, NFD, NFKC, and NFKD. +This article explains the example code found in the file +[lexbor/unicode/normalization_form.c](https://github.com/lexbor/lexbor/blob/master/examples/lexbor/unicode/normalization_form.c). +The program demonstrates how to perform Unicode normalization using the Lexbor +library, specifically focusing on four normalization forms: NFC, NFD, NFKC, and +NFKD. ## Introduction -The code begins by including the necessary headers for Unicode functionality and encoding. It defines a Unicode string, `"ẛ̣"`, which consists of the code points `U+1E9B` (LATIN SMALL LETTER S WITH DOT ABOVE) and `U+0323` (COMBINING DOT BELOW). The program aims to normalize this string and print the results of each normalization form. +The code begins by including the necessary headers for Unicode functionality and +encoding. 
It defines a Unicode string, `"ẛ̣"`, which consists of the code points +`U+1E9B` (LATIN SMALL LETTER LONG S WITH DOT ABOVE) and `U+0323` (COMBINING DOT +BELOW). The program aims to normalize this string and print the results of each +normalization form. ## Main Function -The `main` function is the entry point of the program. Here, a `lxb_unicode_normalizer_t` object is created with the function `lxb_unicode_normalizer_create()`. This object will be used to perform the normalization forms. The initialization of this object specifies the normalization form to use, starting with NFC (Normalization Form C). +The `main` function is the entry point of the program. Here, a +`lxb_unicode_normalizer_t` object is created with the function +`lxb_unicode_normalizer_create()`. This object will be used to perform the +normalization forms. The initialization of this object specifies the +normalization form to use, starting with NFC (Normalization Form C). ### Initialization -After the Unicode normalizer object is successfully created, it is initialized with NFC: +After the Unicode normalizer object is successfully created, it is initialized +with NFC: ```c status = lxb_unicode_normalizer_init(uc, LXB_UNICODE_NFC); ``` -If the initialization fails (`status != LXB_STATUS_OK`), an error message is printed, and the program exits with a failure status. Similar checks are made after each normalization operation to handle potential errors. +If the initialization fails (`status != LXB_STATUS_OK`), an error message is +printed, and the program exits with a failure status. Similar checks are made +after each normalization operation to handle potential errors. ## Normalization Operations -The code proceeds through each normalization form: NFC, NFD, NFKC, and NFKD. In each case, the following steps are performed: +The code proceeds through each normalization form: NFC, NFD, NFKC, and NFKD. In +each case, the following steps are performed: -1. 
Set the desired normalization form using `lxb_unicode_normalization_form_set(uc, ...)`. -2. Call `lxb_unicode_normalize(...)` to perform the normalization, passing the source string, its length, a callback function to handle the result, the name of the normalization form, and a boolean indicating whether the function should show its results. +1. Set the desired normalization form using + `lxb_unicode_normalization_form_set(uc, ...)`. +2. Call `lxb_unicode_normalize(...)` to perform the normalization, passing the + source string, its length, a callback function to handle the result, the name + of the normalization form (as the callback context), and a boolean indicating + whether this is the last chunk of input. For instance, the NFC normalization is conducted as follows: @@ -33,22 +53,34 @@ For instance, the NFC normalization is conducted as follows: status = lxb_unicode_normalize(uc, source, sizeof(source) - 1, callback, "NFC", true); ``` -Each normalization form will produce a different output, reflecting how the Unicode string is represented under various normalization rules. The callback function processes the normalized output. +Each normalization form will produce a different output, reflecting how the +Unicode string is represented under various normalization rules. The callback +function processes the normalized output. ## Callback Function -The `callback` function accepts the normalized data, its length, and a context string (the name of the normalization form). Inside this function, the received data is processed to decode valid UTF-8 sequences. 
It utilizes the Lexbor +function `lxb_encoding_decode_valid_utf_8_single()` to decode each character +code point and print it in hexadecimal format. ### Printing the Results Here's how the function handles output: 1. It prints the name of the normalization being processed. -2. It enters a loop to decode and print each code point in hexadecimal format until all data is processed. +2. It enters a loop to decode and print each code point in hexadecimal format + until all data is processed. 3. Finally, it prints the original data in a string format for reference. ## Conclusion -After performing all normalization forms, the program cleans up by calling `lxb_unicode_normalizer_destroy(uc, true)` to free the allocated resources. It returns a success status, indicating that all operations were completed without errors. +After performing all normalization forms, the program cleans up by calling +`lxb_unicode_normalizer_destroy(uc, true)` to free the allocated resources. It +returns a success status, indicating that all operations were completed without +errors. -This example provides a practical approach to understanding how Unicode normalization works in the Lexbor library and demonstrates how to handle Unicode strings effectively. \ No newline at end of file +This example provides a practical approach to understanding how Unicode +normalization works in the Lexbor library and demonstrates how to handle Unicode +strings effectively. \ No newline at end of file diff --git a/source/examples/unicode/normalization_form_stdin.md b/source/examples/unicode/normalization_form_stdin.md index cfb482a..89a1ba4 100644 --- a/source/examples/unicode/normalization_form_stdin.md +++ b/source/examples/unicode/normalization_form_stdin.md @@ -1,29 +1,60 @@ # Unicode Normalization Form Example -This article describes the implementation found in the file [lexbor/unicode/normalization_form_stdin.c](https://github.com/lexbor/lexbor/blob/master/examples/lexbor/unicode/normalization_form_stdin.c). 
The purpose of this code example is to read input from standard input (stdin), apply a specified Unicode normalization form, and print the normalized output. The program supports four normalization forms: NFC, NFD, NFKC, and NFKD. +This article describes the implementation found in the file +[lexbor/unicode/normalization_form_stdin.c](https://github.com/lexbor/lexbor/blob/master/examples/lexbor/unicode/normalization_form_stdin.c). +The purpose of this code example is to read input from standard input (stdin), +apply a specified Unicode normalization form, and print the normalized output. +The program supports four normalization forms: NFC, NFD, NFKC, and NFKD. ## Overview of the Code -The code begins with necessary include statements and defines the structure for the callback function. Here's a breakdown of the main parts of the code: +The code begins with necessary include statements and defines the structure for +the callback function. Here's a breakdown of the main parts of the code: ### Main Function -The `main` function serves as the entry point of the program. Its operation includes: +The `main` function serves as the entry point of the program. Its operation +includes: -1. **Argument Handling**: It verifies that at least one argument is provided to specify the normalization form. If not, it directs the flow to a usage message. The accepted arguments are either "NFC" or "NFD" for three-character forms and "NFKC" or "NFKD" for four-character forms. +1. **Argument Handling**: It verifies that at least one argument is provided to + specify the normalization form. If not, it directs the flow to a usage + message. The accepted arguments are either "NFC" or "NFD" for three-character + forms and "NFKC" or "NFKD" for four-character forms. -2. **Normalization Form Selection**: Depending on the command line argument, the program sets the appropriate normalization form using a series of `if` statements that compare the input string. 
If none of the specified forms are matched, it again leads to the usage message. +2. **Normalization Form Selection**: Depending on the command line argument, the + program sets the appropriate normalization form using a series of `if` + statements that compare the input string. If none of the specified forms are + matched, it again leads to the usage message. -3. **Initialization of the Normalizer**: The Unicode normalizer is created with `lxb_unicode_normalizer_create()`, followed by its initialization using `lxb_unicode_normalizer_init()`. Upon failure to initialize, the program returns an error status. +3. **Initialization of the Normalizer**: The Unicode normalizer is created with + `lxb_unicode_normalizer_create()`, followed by its initialization using + `lxb_unicode_normalizer_init()`. Upon failure to initialize, the program + returns an error status. -4. **Reading Input and Normalization Loop**: The program then enters a loop where it reads data from stdin into an input buffer. Using `fread`, it checks if the end of the file (EOF) is reached or if an error occurs during reading. If data is read successfully, it passes the input to the normalization function `lxb_unicode_normalize()`, which applies the specified normalization using a callback function. +4. **Reading Input and Normalization Loop**: The program then enters a loop + where it reads data from stdin into an input buffer. Using `fread`, it checks + if the end of the file (EOF) is reached or if an error occurs during reading. + If data is read successfully, it passes the input to the normalization + function `lxb_unicode_normalize()`, which applies the specified normalization + using a callback function. -5. **Cleanup**: After processing, it cleans up by destroying the normalizer with `lxb_unicode_normalizer_destroy()`. +5. **Cleanup**: After processing, it cleans up by destroying the normalizer with + `lxb_unicode_normalizer_destroy()`. 
### The Callback Function -The `callback` function is defined to handle the normalized output data. It takes the normalized data along with its length and prints it to the standard output. The format specifier `%.*s` is used to ensure that only the part of the buffer corresponding to the normalized data length is printed, handling potential null-termination issues gracefully. +The `callback` function is defined to handle the normalized output data. It +takes the normalized data along with its length and prints it to the standard +output. The format specifier `%.*s` is used to ensure that only the part of the +buffer corresponding to the normalized data length is printed, handling +potential null-termination issues gracefully. ## Conclusion -This example illustrates how to implement a basic command line utility for Unicode normalization using the lexbor library. It effectively demonstrates handling input, processing data with a normalization algorithm, and producing output. This utility can be useful in applications where consistent Unicode representation is crucial, such as in text processing and data interchange scenarios. Users can invoke the tool with specific normalization forms to transform their input accordingly. \ No newline at end of file +This example illustrates how to implement a basic command line utility for +Unicode normalization using the lexbor library. It effectively demonstrates +handling input, processing data with a normalization algorithm, and producing +output. This utility can be useful in applications where consistent Unicode +representation is crucial, such as in text processing and data interchange +scenarios. Users can invoke the tool with specific normalization forms to +transform their input accordingly. 
\ No newline at end of file diff --git a/source/examples/url/index.md b/source/examples/url/index.md index 364aa5d..d2f46a4 100644 --- a/source/examples/url/index.md +++ b/source/examples/url/index.md @@ -1,6 +1,7 @@ # URL Examples -These articles delve into some of the examples in more detail to help you use `lexbor` more effectively. +These articles delve into some of the examples in more detail to help you use +`lexbor` more effectively. ```{toctree} :maxdepth: 1 diff --git a/source/examples/url/parse.md b/source/examples/url/parse.md index b488149..834bd02 100644 --- a/source/examples/url/parse.md +++ b/source/examples/url/parse.md @@ -1,29 +1,44 @@ # URL Parsing Example -This article examines a code example from the [lexbor/url/parse.c](https://github.com/lexbor/lexbor/blob/master/examples/lexbor/url/parse.c) file, focusing on URL parsing using the Lexbor library. The intent of this code is to demonstrate how to initialize the URL parser, parse a URL string, and subsequently serialize different components of the parsed URL, such as the scheme, username, password, host, and more. Each section of the code plays a critical role in handling URL data. +This article examines a code example from the +[lexbor/url/parse.c](https://github.com/lexbor/lexbor/blob/master/examples/lexbor/url/parse.c) +file, focusing on URL parsing using the Lexbor library. The intent of this code +is to demonstrate how to initialize the URL parser, parse a URL string, and +subsequently serialize different components of the parsed URL, such as the +scheme, username, password, host, and more. Each section of the code plays a +critical role in handling URL data. ## Code Breakdown ### Initialization -The code begins by including the necessary header for the Lexbor URL library and defining a static callback function. In the `main` function, several variables are declared, including a pointer to `lxb_url_t`, an instance of `lxb_url_parser_t`, and `lxb_unicode_idna_t`. 
+The code begins by including the necessary header for the Lexbor URL library and +defining a static callback function. In the `main` function, several variables +are declared, including a pointer to `lxb_url_t`, an instance of +`lxb_url_parser_t`, and `lxb_unicode_idna_t`. ```c lxb_url_parser_t parser; lxb_unicode_idna_t idna; ``` -Here, `parser` is used to handle the URL parsing logic, while `idna` is utilized for Internationalized Domain Name (IDN) handling. +Here, `parser` is used to handle the URL parsing logic, while `idna` is utilized +for Internationalized Domain Name (IDN) handling. ### Parsing the URL -A static constant `url_str` initializes with a URL string containing various components, including a scheme (`https`), credentials (`panda:pass`), a domain name with Unicode characters, a port number (`2030`), a path, a query parameter, and a fragment. +A static constant `url_str` initializes with a URL string containing various +components, including a scheme (`https`), credentials (`panda:pass`), a domain +name with Unicode characters, a port number (`2030`), a path, a query parameter, +and a fragment. ```c static const lexbor_str_t url_str = lexbor_str("https://panda:pass@тест.com:2030/path/to/hell?id=54321#comments"); ``` -Next, the parser is initialized using the `lxb_url_parser_init` function. It is crucial to check the returned status to ensure that the parser was initialized successfully. +Next, the parser is initialized using the `lxb_url_parser_init` function. It is +crucial to check the returned status to ensure that the parser was initialized +successfully. ```c status = lxb_url_parser_init(&parser, NULL); @@ -33,11 +48,14 @@ if (status != LXB_STATUS_OK) { } ``` -If the parser fails to initialize, an error message is printed, and the program exits. +If the parser fails to initialize, an error message is printed, and the program +exits. 
### Executing the Parse -The URL is parsed through `lxb_url_parse`, which processes the URL string into its various components. Again, it is crucial to validate that the parsing was successful by checking if `url` is `NULL`. +The URL is parsed through `lxb_url_parse`, which processes the URL string into +its various components. Again, it is crucial to validate that the parsing was +successful by checking if `url` is `NULL`. ```c url = lxb_url_parse(&parser, NULL, url_str.data, url_str.length); @@ -49,7 +67,9 @@ if (url == NULL) { ### Serializing URL Components -After successful parsing, the next step involves destroying the parser to clean up resources. The code then initializes the IDNA handler, which is necessary for the following serialization of Unicode hostnames. +After successful parsing, the next step involves destroying the parser to clean +up resources. The code then initializes the IDNA handler, which is necessary for +the following serialization of Unicode hostnames. ```c status = lxb_unicode_idna_init(&idna); @@ -59,23 +79,28 @@ if (status != LXB_STATUS_OK) { } ``` -The program outputs the original URL string and proceeds to serialize various parts of the URL. Each serialization function is linked to the previously defined `callback`, which handles the output for each component. +The program outputs the original URL string and proceeds to serialize various +parts of the URL. Each serialization function is linked to the previously +defined `callback`, which handles the output for each component. - **Serialized URL**: Outputs the entire URL. - **Scheme**: Extracts and displays only the scheme portion. - **Username and Password**: Collects and shows the relevant sections. - **Host**: Contains both ASCII and Unicode serialization capabilities. -- **Port, Path, Query, and Fragment**: Serializes these components in turn, showcasing all aspects of the URL. 
+- **Port, Path, Query, and Fragment**: Serializes these components in turn, + showcasing all aspects of the URL. ```c (void) lxb_url_serialize(url, callback, NULL, false); ``` -Each of these print statements utilizes the callback function to handle the printing of serialized data. +Each of these print statements utilizes the callback function to handle the +printing of serialized data. ### Cleanup -Finally, the program cleans up by destroying the IDNA handler and the allocated URL memory, ensuring that no resources are leaked. +Finally, the program cleans up by destroying the IDNA handler and the allocated +URL memory, ensuring that no resources are leaked. ```c (void) lxb_unicode_idna_destroy(&idna, false); @@ -84,4 +109,9 @@ Finally, the program cleans up by destroying the IDNA handler and the allocated ### Conclusion -The example succinctly demonstrates the capabilities of the Lexbor URL parsing library, showcasing how to initialize the parser, handle a complex URL with Unicode characters, and serialize its components. Each part of the code works harmoniously to show how flexible and powerful URL handling can be in modern C programming with the Lexbor library. The proper initialization, error handling, and cleanup are crucial for robust application development. \ No newline at end of file +The example succinctly demonstrates the capabilities of the Lexbor URL parsing +library, showcasing how to initialize the parser, handle a complex URL with +Unicode characters, and serialize its components. Each part of the code works +harmoniously to show how flexible and powerful URL handling can be in modern C +programming with the Lexbor library. The proper initialization, error handling, +and cleanup are crucial for robust application development. 
\ No newline at end of file diff --git a/source/examples/url/relative.md b/source/examples/url/relative.md index ae21b13..6c26995 100644 --- a/source/examples/url/relative.md +++ b/source/examples/url/relative.md @@ -1,12 +1,19 @@ # URL Parsing Example -This article provides an explanation of the URL parsing example found in the source file [lexbor/url/relative.c](https://github.com/lexbor/lexbor/blob/master/examples/lexbor/url/relative.c). The example demonstrates the parsing of a relative URL based on a provided base URL using the lexbor library. It outlines the setup of the URL parser, handling of input strings, and the serialization of various components of the parsed URL. +This article provides an explanation of the URL parsing example found in the +source file +[lexbor/url/relative.c](https://github.com/lexbor/lexbor/blob/master/examples/lexbor/url/relative.c). +The example demonstrates the parsing of a relative URL based on a provided base +URL using the lexbor library. It outlines the setup of the URL parser, handling +of input strings, and the serialization of various components of the parsed URL. ## Code Breakdown ### Initial Setup -The program begins by including necessary headers and defining the callback function. The callback function serves the purpose of printing parsed URL components. The main function contains the core logic where URL parsing occurs. +The program begins by including necessary headers and defining the callback +function. The callback function serves the purpose of printing parsed URL +components. The main function contains the core logic where URL parsing occurs. ```c static lxb_status_t @@ -15,18 +22,24 @@ callback(const lxb_char_t *data, size_t len, void *ctx); ### URL Initialization -In `main`, variables are defined for the base URL and the URL to parse. The lexbor string structures are initialized with `url_str` and `base_url_str`. The `lxb_url_parser_t parser` is initialized to set up the parser for processing the URLs. 
+In `main`, variables are defined for the base URL and the URL to parse. The +lexbor string structures are initialized with `url_str` and `base_url_str`. The +`lxb_url_parser_t parser` is initialized to set up the parser for processing the +URLs. ```c lxb_url_parser_t parser; status = lxb_url_parser_init(&parser, NULL); ``` -This initializes the parser and checks for successful initialization. If it fails, the program outputs an error message and exits. +This initializes the parser and checks for successful initialization. If it +fails, the program outputs an error message and exits. ### Parsing Base URL -The `base_url` is then parsed using `lxb_url_parse`, which takes the initialized parser, a null pointer (for context), the data of the base URL string, and its length. +The `base_url` is then parsed using `lxb_url_parse`, which takes the initialized +parser, a null pointer (for context), the data of the base URL string, and its +length. ```c base_url = lxb_url_parse(&parser, NULL, base_url_str.data, base_url_str.length); @@ -36,18 +49,23 @@ If parsing the base URL fails, an error message is printed. ### Cleaning Up and Parsing Relative URL -Subsequently, the parser is cleaned up, and the relative URL is parsed in a similar manner using the base URL as a reference. +Subsequently, the parser is cleaned up, and the relative URL is parsed in a +similar manner using the base URL as a reference. ```c lxb_url_parser_clean(&parser); url = lxb_url_parse(&parser, base_url, url_str.data, url_str.length); ``` -Again, if the parsing fails, an appropriate error message is printed. After the relative URL is successfully parsed, the parser must be cleaned up using `lxb_url_parser_destroy`. +Again, if the parsing fails, an appropriate error message is printed. After the +relative URL is successfully parsed, the parser must be cleaned up using +`lxb_url_parser_destroy`. 
### Serializing URL Components -The main focus of this example is the serialization of various components of the parsed URL. Using callbacks, the program outputs the base URL, relative URL, and several segments of the parsed URL: +The main focus of this example is the serialization of various components of the +parsed URL. Using callbacks, the program outputs the base URL, relative URL, and +several segments of the parsed URL: - Scheme - Username @@ -58,17 +76,21 @@ The main focus of this example is the serialization of various components of the - Query - Fragment -Each of these components is printed by invoking serialization functions, such as `lxb_url_serialize_scheme` for the scheme, and so forth. +Each of these components is printed by invoking serialization functions, such as +`lxb_url_serialize_scheme` for the scheme, and so forth. ```c (void) lxb_url_serialize(url, callback, NULL, false); ``` -The callback function defined earlier is utilized here to display each component by printing its representation. +The callback function defined earlier is utilized here to display each component +by printing its representation. ### Final Cleanup -After displaying all URL components, the program cleans up the IDNA context and the memory associated with the parsed URL. This ensures that any resources utilized during the parsing are properly released. +After displaying all URL components, the program cleans up the IDNA context and +the memory associated with the parsed URL. This ensures that any resources +utilized during the parsing are properly released. ```c (void) lxb_unicode_idna_destroy(&idna, false); @@ -77,4 +99,8 @@ After displaying all URL components, the program cleans up the IDNA context and ### Conclusion -The provided example illustrates the process of relative URL parsing using the lexbor library. From initializing the parser to serializing specific components of the URL, each step is crucial for accurate URL handling in applications. 
The careful management of memory and resources also highlights best practices in programming with C. \ No newline at end of file +The provided example illustrates the process of relative URL parsing using the +lexbor library. From initializing the parser to serializing specific components +of the URL, each step is crucial for accurate URL handling in applications. The +careful management of memory and resources also highlights best practices in +programming with C. \ No newline at end of file From 1f23930010614099ca20ee015454f377186d003b Mon Sep 17 00:00:00 2001 From: Toxypi Date: Sat, 28 Sep 2024 10:22:28 +0100 Subject: [PATCH 6/9] Normalized lexbor naming. --- .spellcheck_ignore.txt | 4 ++-- source/articles/part-1-html.md | 6 +++--- source/examples/css/StyleSheet.md | 6 +++--- source/examples/css/selectors/list_easy_way.md | 2 +- source/examples/css/selectors/list_fast_way.md | 2 +- source/examples/css/syntax/structure_parse_file.md | 6 +++--- source/examples/css/syntax/tokenizer/chunks_stdin.md | 4 ++-- source/examples/css/syntax/tokenizer/from_file.md | 4 ++-- source/examples/css/syntax/tokenizer/print_raw.md | 6 +++--- source/examples/encoding/buffer/decode/decode.md | 4 ++-- source/examples/encoding/buffer/decode/decoder.md | 4 ++-- source/examples/encoding/buffer/decode/validate.md | 6 +++--- source/examples/encoding/buffer/encode/encode.md | 4 ++-- source/examples/encoding/buffer/encode/encoder.md | 2 +- source/examples/encoding/buffer/encode/validate.md | 6 +++--- source/examples/encoding/buffer/from_to.md | 2 +- source/examples/encoding/data_by_name.md | 8 ++++---- source/examples/encoding/single/decode/decode.md | 4 ++-- source/examples/encoding/single/decode/decoder.md | 6 +++--- source/examples/encoding/single/decode/validate.md | 6 +++--- source/examples/encoding/single/encode/encode.md | 8 ++++---- source/examples/encoding/single/encode/validate.md | 6 +++--- source/examples/encoding/single/from_to.md | 6 +++--- source/examples/html/document_parse.md 
| 4 ++-- source/examples/html/document_parse_chunk.md | 4 ++-- source/examples/html/document_title.md | 6 +++--- source/examples/html/element_attributes.md | 6 +++--- source/examples/html/element_create.md | 6 +++--- source/examples/html/element_innerHTML.md | 8 ++++---- source/examples/html/elements_by_attr.md | 4 ++-- source/examples/html/elements_by_class_name.md | 4 ++-- source/examples/html/elements_by_tag_name.md | 8 ++++---- source/examples/html/encoding.md | 6 +++--- source/examples/html/html2sexpr.md | 8 ++++---- source/examples/html/parse.md | 2 +- source/examples/html/parse_chunk.md | 6 +++--- source/examples/html/tokenizer/callback.md | 2 +- source/examples/html/tokenizer/simple.md | 2 +- source/examples/html/tokenizer/text.md | 8 ++++---- source/examples/punycode/decode.md | 4 ++-- source/examples/punycode/encode.md | 4 ++-- source/examples/selectors/easy_way.md | 6 +++--- source/examples/selectors/normal_way.md | 8 ++++---- source/examples/selectors/unique_nodes.md | 4 ++-- source/examples/styles/attribute_style.md | 2 +- source/examples/styles/events_insert.md | 8 ++++---- source/examples/styles/stylesheet.md | 6 +++--- source/examples/styles/walk.md | 6 +++--- source/examples/unicode/normalization_form.md | 6 +++--- source/examples/unicode/normalization_form_stdin.md | 2 +- source/examples/url/parse.md | 8 ++++---- source/examples/url/relative.md | 4 ++-- 52 files changed, 132 insertions(+), 132 deletions(-) diff --git a/.spellcheck_ignore.txt b/.spellcheck_ignore.txt index 80a4ca9..d40f215 100644 --- a/.spellcheck_ignore.txt +++ b/.spellcheck_ignore.txt @@ -51,9 +51,9 @@ js JSON keyring lexbor -Lexbor +`lexbor` LEXBOR -Lexbor's +`lexbor`'s lexbor's li lifecycle diff --git a/source/articles/part-1-html.md b/source/articles/part-1-html.md index 4f9e081..f7498d6 100644 --- a/source/articles/part-1-html.md +++ b/source/articles/part-1-html.md @@ -413,7 +413,7 @@ affect the tokenizer. These dependencies are largely due to namespaces. 
## How to Solve Issues? -I will outline an HTML parser implementation for my Lexbor project, along with +I will outline an HTML parser implementation for my `lexbor` project, along with solutions to the problems discussed. ### Preprocessing @@ -674,7 +674,7 @@ tree_build_in_body_character(token) { } ``` -In Lexbor HTML: +In `lexbor` HTML: ```c tree_build_in_body_character(token) { lexbor_str_t str = {0}; @@ -748,7 +748,7 @@ move forward. Next, I will focus on CSS parsing and developing a custom grammar ## Sources The approach to parsing and HTML tree construction described here is implemented -in my [Lexbor](https://github.com/lexbor/lexbor) HTML library. +in my [`lexbor`](https://github.com/lexbor/lexbor) HTML library. ## P.S. diff --git a/source/examples/css/StyleSheet.md b/source/examples/css/StyleSheet.md index 36a5728..c859da7 100644 --- a/source/examples/css/StyleSheet.md +++ b/source/examples/css/StyleSheet.md @@ -2,7 +2,7 @@ This article explains the example code within the file [lexbor/css/StyleSheet.c](https://github.com/lexbor/lexbor/blob/master/examples/lexbor/css/StyleSheet.c), -which demonstrates how to use the Lexbor library to read and parse a CSS +which demonstrates how to use the `lexbor` library to read and parse a CSS stylesheet. The code showcases the steps required to initialize the parser, read the CSS data from a file, parse the stylesheet, and serialize the resulting object. @@ -128,7 +128,7 @@ The program concludes successfully by returning `EXIT_SUCCESS`. ## Summary In this example, a CSS file is read, parsed, and its contents serialized using -the Lexbor library. Each significant section of the code has been explained to +the `lexbor` library. Each significant section of the code has been explained to provide clarity on the parsing process and resource management. By following these steps, developers can incorporate CSS parsing capabilities into their -applications using Lexbor. \ No newline at end of file +applications using `lexbor`. 
\ No newline at end of file diff --git a/source/examples/css/selectors/list_easy_way.md b/source/examples/css/selectors/list_easy_way.md index 4fa238d..fe38e23 100644 --- a/source/examples/css/selectors/list_easy_way.md +++ b/source/examples/css/selectors/list_easy_way.md @@ -1,7 +1,7 @@ # CSS Selector Parsing Example This article provides an in-depth explanation of the code found in -`list_easy_way.c`, which demonstrates how to use the lexbor library for parsing +`list_easy_way.c`, which demonstrates how to use the `lexbor` library for parsing CSS selectors. The code illustrates the steps involved in initializing a parser, parsing a CSS selector string, and handling the results and logs. diff --git a/source/examples/css/selectors/list_fast_way.md b/source/examples/css/selectors/list_fast_way.md index b4ec47b..9eba765 100644 --- a/source/examples/css/selectors/list_fast_way.md +++ b/source/examples/css/selectors/list_fast_way.md @@ -75,7 +75,7 @@ list, it is noted accordingly. ### Conclusion The `list_fast_way.c` example serves as a practical guide for developers looking -to understand how to parse CSS selectors using the lexbor library. By +to understand how to parse CSS selectors using the `lexbor` library. By emphasizing memory management, proper initialization, and error handling, this example lays a solid foundation for further applications of the library in real-world projects. The code harnesses the flexibility of lexbor while diff --git a/source/examples/css/syntax/structure_parse_file.md b/source/examples/css/syntax/structure_parse_file.md index c24d38a..f564401 100644 --- a/source/examples/css/syntax/structure_parse_file.md +++ b/source/examples/css/syntax/structure_parse_file.md @@ -2,14 +2,14 @@ This article provides an overview of the code located in [lexbor/css/syntax/structure_parse_file.c](https://github.com/lexbor/lexbor/blob/master/examples/lexbor/css/syntax/structure_parse_file.c), -which implements a CSS syntax parser using the lexbor library. 
The primary goal +which implements a CSS syntax parser using the `lexbor` library. The primary goal of this code is to parse CSS syntax rules and declarations, handling various states and transitions within the parsing process. ## Code Overview The code starts with the inclusion of headers that bring in necessary -definitions and functions from the lexbor library. It defines multiple functions +definitions and functions from the `lexbor` library. It defines multiple functions and callback structures that manage the parsing of different CSS constructs. Central to the code is the `main` function, which serves as the entry point of the application. @@ -85,5 +85,5 @@ through the parsing state. The code contained in `structure_parse_file.c` offers a comprehensive implementation of a CSS syntax parser with well-defined states and callbacks. The use of systematic error handling and resource management provides stability -to the parsing process. By integrating these components, the lexbor library +to the parsing process. By integrating these components, the `lexbor` library enhances its ability to interpret and manipulate CSS effectively. \ No newline at end of file diff --git a/source/examples/css/syntax/tokenizer/chunks_stdin.md b/source/examples/css/syntax/tokenizer/chunks_stdin.md index 5f8cd80..91d140c 100644 --- a/source/examples/css/syntax/tokenizer/chunks_stdin.md +++ b/source/examples/css/syntax/tokenizer/chunks_stdin.md @@ -8,7 +8,7 @@ output the identified token types along with their serialized representations. ## Overview The main purpose of this example is to showcase the mechanics of the -`lxb_css_syntax_tokenizer`, a component provided by the Lexbor library for +`lxb_css_syntax_tokenizer`, a component provided by the `lexbor` library for parsing CSS syntax. The example leverages standard input (stdin) to read CSS input, processes the tokens through the tokenizer, and outputs details about each token to the console. @@ -144,6 +144,6 @@ avoid memory leaks. 
## Conclusion This example illustrates how to implement a simple CSS syntax tokenizer using -the Lexbor library, allowing for parsing CSS input from stdin and outputting +the `lexbor` library, allowing for parsing CSS input from stdin and outputting token information. Anyone looking to understand or extend CSS parsing functionality can use this code as a foundation for further development. \ No newline at end of file diff --git a/source/examples/css/syntax/tokenizer/from_file.md b/source/examples/css/syntax/tokenizer/from_file.md index 6140a65..74492cc 100644 --- a/source/examples/css/syntax/tokenizer/from_file.md +++ b/source/examples/css/syntax/tokenizer/from_file.md @@ -11,7 +11,7 @@ extract tokens, and producing output that describes each token. The main function of the tokenizer is to parse CSS code from a file, generate tokens for syntactic analysis, and then invoke a callback function to handle the output of each token. The program efficiently handles input and organizes the -parsing process with the help of the lexbor library. +parsing process with the help of the `lexbor` library. ## Code Breakdown @@ -153,7 +153,7 @@ Finally, the program returns `EXIT_SUCCESS` if the execution was successful, or ## Conclusion The CSS syntax tokenizer effectively reads and parses a CSS file, extracting and -displaying token details by utilizing the lexbor library's API for CSS +displaying token details by utilizing the `lexbor` library's API for CSS processing. This example demonstrates not only the functionality of lexer-based parsing but also highlights memory management and error handling within a complex system. 
\ No newline at end of file diff --git a/source/examples/css/syntax/tokenizer/print_raw.md b/source/examples/css/syntax/tokenizer/print_raw.md index f0e9799..ebcc272 100644 --- a/source/examples/css/syntax/tokenizer/print_raw.md +++ b/source/examples/css/syntax/tokenizer/print_raw.md @@ -1,7 +1,7 @@ # CSS Syntax Tokenizer Example This article provides an overview of the `print_raw.c` source file, which -implements a simple command-line tool for tokenizing CSS syntax using the Lexbor +implements a simple command-line tool for tokenizing CSS syntax using the `lexbor` library. The primary purpose of this code is to read a CSS file, tokenize its contents, and print the tokens to the standard output. @@ -123,8 +123,8 @@ returns `EXIT_FAILURE`. ## Conclusion -The `print_raw.c` implementation demonstrates how to leverage the Lexbor library +The `print_raw.c` implementation demonstrates how to leverage the `lexbor` library for CSS syntax tokenization. By following a structured approach, it effectively reads CSS content, processes it into tokens, and provides robust error handling. This example serves as a foundation for further exploration of CSS parsing and -analysis using Lexbor. \ No newline at end of file +analysis using `lexbor`. \ No newline at end of file diff --git a/source/examples/encoding/buffer/decode/decode.md b/source/examples/encoding/buffer/decode/decode.md index 286b0ea..d572ce6 100644 --- a/source/examples/encoding/buffer/decode/decode.md +++ b/source/examples/encoding/buffer/decode/decode.md @@ -3,7 +3,7 @@ In this article, we will explore a code example from the file [lexbor/encoding/buffer/decode/decode.c](https://github.com/lexbor/lexbor/blob/master/examples/lexbor/encoding/buffer/decode/decode.c) that demonstrates how to decode a UTF-8 encoded string into code points using -the Lexbor library. This example specifically highlights the usage of Lexbor's +the `lexbor` library. 
This example specifically highlights the usage of `lexbor`'s encoding functionalities, providing insights into how to leverage these features for character decoding in C. @@ -69,7 +69,7 @@ The program concludes with a return statement indicating successful execution. ## Summary This example effectively illustrates how to decode a UTF-8 string into -individual code points using the Lexbor library. It emphasizes the +individual code points using the `lexbor` library. It emphasizes the initialization of the decoding context, error handling strategies, and the process of translating encoded UTF-8 data into usable character representations. Through careful management of buffers and decoding functions, developers can diff --git a/source/examples/encoding/buffer/decode/decoder.md b/source/examples/encoding/buffer/decode/decoder.md index 9ae8765..bfbf69b 100644 --- a/source/examples/encoding/buffer/decode/decoder.md +++ b/source/examples/encoding/buffer/decode/decoder.md @@ -1,7 +1,7 @@ # Unicode Decoder Example In this article, we will discuss a simple Unicode decoder implemented in C, -specifically within the context of the lexbor library. The code can be found in +specifically within the context of the `lexbor` library. The code can be found in the source file [lexbor/encoding/buffer/decode/decoder.c](https://github.com/lexbor/lexbor/blob/master/examples/lexbor/encoding/buffer/decode/decoder.c). This program is designed to take a specified character encoding from the command @@ -76,7 +76,7 @@ points are then printed if any exist in the output buffer. ## Conclusion -This `decoder.c` example illustrates the practical use of the lexbor library for +This `decoder.c` example illustrates the practical use of the `lexbor` library for handling various character encodings and converting them into a clear, usable form. 
By leveraging the available utility functions and error handling methods, the code provides a robust framework for decoding inputs in a specified diff --git a/source/examples/encoding/buffer/decode/validate.md b/source/examples/encoding/buffer/decode/validate.md index 7c46088..effd71b 100644 --- a/source/examples/encoding/buffer/decode/validate.md +++ b/source/examples/encoding/buffer/decode/validate.md @@ -1,7 +1,7 @@ # UTF-8 Decoding and Replacement Example This article will explain a C code example that demonstrates UTF-8 decoding and -the handling of invalid byte sequences using the lexbor library. The source file +the handling of invalid byte sequences using the `lexbor` library. The source file for the example is [lexbor/encoding/buffer/decode/validate.c](https://github.com/lexbor/lexbor/blob/master/examples/lexbor/encoding/buffer/decode/validate.c). @@ -15,7 +15,7 @@ points. This is accomplished utilizing the lexbor encoding API. ### Including Necessary Headers -At the start of the code, the relevant header file from the lexbor library is +At the start of the code, the relevant header file from the `lexbor` library is included: ```c @@ -132,7 +132,7 @@ hexadecimal representation. ## Conclusion -This example effectively showcases the use of the lexbor library for decoding +This example effectively showcases the use of the `lexbor` library for decoding UTF-8 strings while managing potentially invalid byte sequences. By initializing the decoder, setting up replacement strategies, and decoding the input string, the program demonstrates a robust method for handling encoding issues in C. 
\ No newline at end of file diff --git a/source/examples/encoding/buffer/encode/encode.md b/source/examples/encoding/buffer/encode/encode.md index cbf9bbc..1d41f59 100644 --- a/source/examples/encoding/buffer/encode/encode.md +++ b/source/examples/encoding/buffer/encode/encode.md @@ -1,7 +1,7 @@ # Encoding Unicode Code Points to UTF-8 Example This article explains the encoding of Unicode code points to a UTF-8 byte string -using the Lexbor library. The source code is located in +using the `lexbor` library. The source code is located in [lexbor/encoding/buffer/encode/encode.c](https://github.com/lexbor/lexbor/blob/master/examples/lexbor/encoding/buffer/encode/encode.c). This example demonstrates how to initialize the encoder, encode Unicode code points, and handle the output appropriately. @@ -116,7 +116,7 @@ Unicode values shown in hexadecimal format. ## Conclusion -This code example effectively demonstrates the usage of the Lexbor encoding +This code example effectively demonstrates the usage of the `lexbor` encoding library for converting Unicode code points to a UTF-8 encoded string. It emphasizes proper initialization, error handling, and output formatting, which are essential for working with character encoding in C programming. \ No newline at end of file diff --git a/source/examples/encoding/buffer/encode/encoder.md b/source/examples/encoding/buffer/encode/encoder.md index 98c42c1..f2d92e3 100644 --- a/source/examples/encoding/buffer/encode/encoder.md +++ b/source/examples/encoding/buffer/encode/encoder.md @@ -157,7 +157,7 @@ written out before the program exits. ## Conclusion The `encoder.c` file is a functional implementation of an encoding utility using -the lexbor library. It effectively handles various character encodings, +the `lexbor` library. It effectively handles various character encodings, processes input data in a loop, and provides useful output, making it a useful tool for developers working with different text encodings. 
The awareness of error handling and usage guidance further enhances its usability in command-line diff --git a/source/examples/encoding/buffer/encode/validate.md b/source/examples/encoding/buffer/encode/validate.md index 66cbb36..5edee1c 100644 --- a/source/examples/encoding/buffer/encode/validate.md +++ b/source/examples/encoding/buffer/encode/validate.md @@ -4,7 +4,7 @@ This article explains the functionality of a Unicode encoding example, which can be found in the source file [lexbor/encoding/buffer/encode/validate.c](https://github.com/lexbor/lexbor/blob/master/examples/lexbor/encoding/buffer/encode/validate.c). The code serves as an illustration of how to encode Unicode code points into a -UTF-8 byte string using the Lexbor library. +UTF-8 byte string using the `lexbor` library. ## Overview @@ -19,7 +19,7 @@ and configuring it with replacement bytes for invalid code points. The code begins by including necessary header files, specifically `string.h` for string manipulation and `lexbor/encoding/encoding.h` for encoding functions from -the Lexbor library. A macro named `FAILED` is defined for error handling, which +the `lexbor` library. A macro named `FAILED` is defined for error handling, which simplifies reporting errors by outputting a message to `stderr` and exiting the program with a failure status. @@ -75,7 +75,7 @@ points. ## Conclusion -This example showcases how to utilize the Lexbor library to encode Unicode code +This example showcases how to utilize the `lexbor` library to encode Unicode code points into a UTF-8 byte string while implementing error handling and customization through replacement bytes for invalid code points. 
By following the steps outlined, developers can efficiently manage Unicode data in their diff --git a/source/examples/encoding/buffer/from_to.md b/source/examples/encoding/buffer/from_to.md index a3c35ee..aa49384 100644 --- a/source/examples/encoding/buffer/from_to.md +++ b/source/examples/encoding/buffer/from_to.md @@ -79,7 +79,7 @@ operations, and handles errors appropriately. ## Conclusion The `from_to` example illustrates how to adeptly handle encoding conversions in -C using the lexbor library. By providing a structured way to manage different +C using the `lexbor` library. By providing a structured way to manage different encodings and offering clear error handling, this example serves as a foundational component in the development of applications that require text data manipulation across various encodings. The modular approach allows enhancements diff --git a/source/examples/encoding/data_by_name.md b/source/examples/encoding/data_by_name.md index 8f308cf..5ff656b 100644 --- a/source/examples/encoding/data_by_name.md +++ b/source/examples/encoding/data_by_name.md @@ -3,14 +3,14 @@ This article provides an explanation of an example from the file [lexbor/encoding/data_by_name.c](https://github.com/lexbor/lexbor/blob/master/examples/lexbor/encoding/data_by_name.c). The purpose of this code is to demonstrate how to retrieve encoding data by its -name using the Lexbor encoding library. The code illustrated here highlights the +name using the `lexbor` encoding library. The code illustrated here highlights the procedure for accessing character encoding information, specifically focusing on UTF-8. ## Code Explanation The program starts with the necessary `#include` directive, which includes the -Lexbor encoding library header file. This library provides the functionality +`lexbor` encoding library header file. This library provides the functionality needed to work with different character encodings. 
### Main Function @@ -35,7 +35,7 @@ enc_data = lxb_encoding_data_by_name((lxb_char_t *) "uTf-8", 5); ``` In this segment, the variable `enc_data` is declared as a pointer to -`lxb_encoding_data_t`, which represents the encoding data structure in Lexbor. +`lxb_encoding_data_t`, which represents the encoding data structure in `lexbor`. The function `lxb_encoding_data_by_name` is called with two arguments: the string "uTf-8" (with a deliberate mixed case) and the length of the string, which is `5`. @@ -86,7 +86,7 @@ program has run without any issues. The example presented in [lexbor/encoding/data_by_name.c](https://github.com/lexbor/lexbor/blob/master/examples/lexbor/encoding/data_by_name.c) -effectively demonstrates how to access encoding data using the Lexbor encoding +effectively demonstrates how to access encoding data using the `lexbor` encoding library. It showcases the importance of error handling and provides a simple way to retrieve and display the name of a character encoding, using UTF-8 as a practical example. This code can serve as a foundational component for diff --git a/source/examples/encoding/single/decode/decode.md b/source/examples/encoding/single/decode/decode.md index a57405c..20cff2b 100644 --- a/source/examples/encoding/single/decode/decode.md +++ b/source/examples/encoding/single/decode/decode.md @@ -3,7 +3,7 @@ This article explains a code example from [lexbor/encoding/single/decode/decode.c](https://github.com/lexbor/lexbor/blob/master/examples/lexbor/encoding/single/decode/decode.c), which demonstrates how to decode a UTF-8 string into its respective code points -using the lexbor library. +using the `lexbor` library. ## Introduction @@ -140,6 +140,6 @@ substring) and its corresponding Unicode code point in hexadecimal format. ## Conclusion The example demonstrates a straightforward approach to decoding a UTF-8 string -into Unicode code points using the lexbor library. 
It effectively showcases +into Unicode code points using the `lexbor` library. It effectively showcases initialization, error handling, and character decoding, providing a practical illustration of working with character encodings in C. \ No newline at end of file diff --git a/source/examples/encoding/single/decode/decoder.md b/source/examples/encoding/single/decode/decoder.md index 1e1e42b..1d9b108 100644 --- a/source/examples/encoding/single/decode/decoder.md +++ b/source/examples/encoding/single/decode/decoder.md @@ -10,12 +10,12 @@ needing to handle various text encodings in their applications. The main function of this code is to read data from standard input, decode it according to the specified encoding, and print the corresponding Unicode values. -It uses the Lexbor library to facilitate this process. +It uses the `lexbor` library to facilitate this process. ### Header and Includes At the beginning of the file, we find the licensing information and the -inclusion of the Lexbor encoding header: +inclusion of the `lexbor` encoding header: ```c #include @@ -164,7 +164,7 @@ if (cp == LXB_ENCODING_DECODE_CONTINUE) { ### Conclusion -By effectively using the Lexbor library's encoding functionalities, this code +By effectively using the `lexbor` library's encoding functionalities, this code provides a flexible and powerful example of how to decode various text encodings from standard input. Developers can adapt this example for their applications, thereby enhancing their ability to handle encoded text data efficiently. \ No newline at end of file diff --git a/source/examples/encoding/single/decode/validate.md b/source/examples/encoding/single/decode/validate.md index 62800de..bb63386 100644 --- a/source/examples/encoding/single/decode/validate.md +++ b/source/examples/encoding/single/decode/validate.md @@ -1,7 +1,7 @@ # UTF-8 Decoding and Validation Example This article explains an example of decoding and validating a UTF-8 string, -using the Lexbor library. 
The source file for this code example is +using the `lexbor` library. The source file for this code example is [lexbor/encoding/single/decode/validate.c](https://github.com/lexbor/lexbor/blob/master/examples/lexbor/encoding/single/decode/validate.c). The primary objective of this code is to demonstrate how to properly decode a UTF-8 encoded string, handle decoding errors, and output both valid code points @@ -10,7 +10,7 @@ and error information for invalid byte sequences. ## Code Breakdown The example begins with necessary includes and macro definitions. It imports the -required header file for Lexbor encoding and defines a macro `FAILED` that +required header file for `lexbor` encoding and defines a macro `FAILED` that handles error reporting and terminates the program if an error occurs. ### Setting Up the Main Function @@ -86,7 +86,7 @@ if (cp > LXB_ENCODING_DECODE_MAX_CODEPOINT) { The program concludes by returning a success status if all decoding operations complete without errors. In summary, this code serves as an illustrative example -of how to utilize the Lexbor encoding library to decode and validate UTF-8 +of how to utilize the `lexbor` encoding library to decode and validate UTF-8 encoded strings effectively, while properly handling potential errors in byte sequences. By implementing this method, developers can ensure their applications correctly interpret and display UTF-8 content. \ No newline at end of file diff --git a/source/examples/encoding/single/encode/encode.md b/source/examples/encoding/single/encode/encode.md index 968540d..aa9c1d2 100644 --- a/source/examples/encoding/single/encode/encode.md +++ b/source/examples/encoding/single/encode/encode.md @@ -4,11 +4,11 @@ This article explains the purpose and functionality of the UTF-8 encoding example provided in the file [lexbor/encoding/single/encode/encode.c](https://github.com/lexbor/lexbor/blob/master/examples/lexbor/encoding/single/encode/encode.c). 
The code demonstrates how to encode a series of Unicode code points into a UTF-8 -byte string using the Lexbor encoding library. +byte string using the `lexbor` encoding library. ## Code Overview -The program begins by including the necessary header file for the Lexbor +The program begins by including the necessary header file for the `lexbor` encoding library. It defines a macro for error handling named `FAILED`, which simplifies printing error messages and terminating the program if initialization or execution fails. @@ -51,6 +51,6 @@ encoder instance. The following key steps are involved in the encoding process: ## Conclusion Upon reaching the end of the program, it exits gracefully, indicating successful -execution. This example illustrates how to use the Lexbor encoding library for +execution. This example illustrates how to use the `lexbor` encoding library for converting Unicode code points to a UTF-8 encoded string, providing a clear and -practical implementation of encoding functionality in C using Lexbor. \ No newline at end of file +practical implementation of encoding functionality in C using `lexbor`. \ No newline at end of file diff --git a/source/examples/encoding/single/encode/validate.md b/source/examples/encoding/single/encode/validate.md index e2beeaf..93cd26d 100644 --- a/source/examples/encoding/single/encode/validate.md +++ b/source/examples/encoding/single/encode/validate.md @@ -1,7 +1,7 @@ # Encoding Unicode Code Points to UTF-8 Example This example demonstrates how to validate and encode Unicode code points into a -UTF-8 byte string using the lexbor library. The functionality is encapsulated +UTF-8 byte string using the `lexbor` library. The functionality is encapsulated within a C program located in the [lexbor/encoding/single/encode/validate.c](https://github.com/lexbor/lexbor/blob/master/examples/lexbor/encoding/single/encode/validate.c) file. 
The purpose of this code is to illustrate the encoding of a set of given @@ -10,7 +10,7 @@ with a predefined replacement character. ## Overview of the Code -The code begins by including the necessary header files from the lexbor library, +The code begins by including the necessary header files from the `lexbor` library, specifically targeting encoding functionality. It subsequently defines a macro for error handling, which outputs an error message to `stderr` and exits the program with a failure status. @@ -71,5 +71,5 @@ the last byte of the buffer to `0x00`. It then prints the final UTF-8 result. The program effectively showcases how to handle Unicode encoding with proper error management for invalid inputs. This example is particularly useful for -developers using the lexbor library to manage character encodings, providing +developers using the `lexbor` library to manage character encodings, providing insight on validating and encoding procedures in C. \ No newline at end of file diff --git a/source/examples/encoding/single/from_to.md b/source/examples/encoding/single/from_to.md index 7339678..c98b585 100644 --- a/source/examples/encoding/single/from_to.md +++ b/source/examples/encoding/single/from_to.md @@ -4,7 +4,7 @@ This article explains the encoding conversion functionality provided in the source file [lexbor/encoding/single/from_to.c](https://github.com/lexbor/lexbor/blob/master/examples/lexbor/encoding/single/from_to.c). The code allows users to convert text from one character encoding to another via -command-line input. It demonstrates how to utilize the Lexbor encoding library +command-line input. It demonstrates how to utilize the `lexbor` encoding library for encoding and decoding different formats of character sets. ## Overview @@ -19,7 +19,7 @@ before writing the output to standard output. 
### Definitions and Includes -At the beginning of the file, we include the necessary header for the Lexbor +At the beginning of the file, we include the necessary header for the `lexbor` encoding module: ```c @@ -125,7 +125,7 @@ resources are cleaned up properly before the program exits. ## Conclusion The `from_to.c` example illustrates a practical approach to character encoding -conversion using the Lexbor encoding library. It showcases error handling, user +conversion using the `lexbor` encoding library. It showcases error handling, user guidance, and processing loops, making it a valuable reference for developers needing to handle various text encodings in their applications. This example emphasizes the importance of robust input handling and clean output generation diff --git a/source/examples/html/document_parse.md b/source/examples/html/document_parse.md index 5326468..398107f 100644 --- a/source/examples/html/document_parse.md +++ b/source/examples/html/document_parse.md @@ -1,6 +1,6 @@ # HTML Document Parsing Example -This article explains an example of parsing an HTML document using the Lexbor +This article explains an example of parsing an HTML document using the `lexbor` library. The purpose of this example, located in the source file [lexbor/html/document_parse.c](https://github.com/lexbor/lexbor/blob/master/examples/lexbor/html/document_parse.c), is to illustrate the steps necessary to create an HTML document, parse a string @@ -110,6 +110,6 @@ maintain system performance and stability. The example provided in [lexbor/html/document_parse.c](https://github.com/lexbor/lexbor/blob/master/examples/lexbor/html/document_parse.c) serves as a clear demonstration of how to create, parse, and handle an HTML -document using Lexbor. Through careful initialization, parsing, result +document using `lexbor`. Through careful initialization, parsing, result outputting, and cleanup, this code illustrates best practices for managing HTML documents in a C environment. 
\ No newline at end of file diff --git a/source/examples/html/document_parse_chunk.md b/source/examples/html/document_parse_chunk.md index b5fc7a9..7dc7f92 100644 --- a/source/examples/html/document_parse_chunk.md +++ b/source/examples/html/document_parse_chunk.md @@ -1,7 +1,7 @@ # HTML Document Parsing Example This article provides an overview of an example implementation of HTML document -parsing using the Lexbor library. The example is located in the source file +parsing using the `lexbor` library. The example is located in the source file [lexbor/html/document_parse_chunk.c](https://github.com/lexbor/lexbor/blob/master/examples/lexbor/html/document_parse_chunk.c). This example demonstrates how to create an HTML document, parse it in chunks, and handle the cleaning up of allocated resources. @@ -116,7 +116,7 @@ lxb_html_document_destroy(document); ## Conclusion -This example effectively illustrates how to use Lexbor for HTML document parsing +This example effectively illustrates how to use `lexbor` for HTML document parsing in a chunked manner. The structure and logic of the code provide a solid foundation for more advanced HTML processing applications. It encapsulates essential operations such as initialization, incremental parsing, result diff --git a/source/examples/html/document_title.md b/source/examples/html/document_title.md index 411e1a1..0c72fbf 100644 --- a/source/examples/html/document_title.md +++ b/source/examples/html/document_title.md @@ -5,7 +5,7 @@ implemented in the source code found in [lexbor/html/document_title.c](https://github.com/lexbor/lexbor/blob/master/examples/lexbor/html/document_title.c). The purpose of this code is to demonstrate how to parse an HTML string, retrieve its title, modify the title, and then display the resulting HTML document -structure using the Lexbor library. +structure using the `lexbor` library. 
## Code Breakdown @@ -113,9 +113,9 @@ lxb_html_document_destroy(document); ## Conclusion This example illustrates the basic operations for handling HTML document titles -using the Lexbor library, including parsing content, accessing and modifying the +using the `lexbor` library, including parsing content, accessing and modifying the title, and ensuring proper resource management. The structure of the code is straightforward, aiming to provide a clear understanding of each step involved in managing an HTML document's title. As developers familiarize themselves with -the functionalities offered by Lexbor, they will be better equipped to +the functionalities offered by `lexbor`, they will be better equipped to manipulate HTML content programmatically. \ No newline at end of file diff --git a/source/examples/html/element_attributes.md b/source/examples/html/element_attributes.md index c8cb12c..b02d503 100644 --- a/source/examples/html/element_attributes.md +++ b/source/examples/html/element_attributes.md @@ -2,7 +2,7 @@ This article explains the implementation found in [lexbor/html/element_attributes.c](https://github.com/lexbor/lexbor/blob/master/examples/lexbor/html/element_attributes.c), -which demonstrates how to manipulate HTML element attributes using the Lexbor +which demonstrates how to manipulate HTML element attributes using the `lexbor` library. The example outlines parsing an HTML snippet, finding an element, and performing various operations involving element attributes, such as adding, checking existence, retrieving, modifying, and removing attributes from an @@ -139,7 +139,7 @@ lxb_html_document_destroy(document); ## Conclusion The `element_attributes.c` example illustrates fundamental operations in DOM -manipulation provided by the Lexbor library. The code efficiently demonstrates +manipulation provided by the `lexbor` library. 
The code efficiently demonstrates how to parse HTML, locate and manipulate elements, manage attributes, and ensure appropriate cleanup of resources, making it a valuable reference for web -developers working with the Lexbor framework. \ No newline at end of file +developers working with the `lexbor` framework. \ No newline at end of file diff --git a/source/examples/html/element_create.md b/source/examples/html/element_create.md index 9de557f..cc641f8 100644 --- a/source/examples/html/element_create.md +++ b/source/examples/html/element_create.md @@ -1,7 +1,7 @@ # HTML Element Creation Example This article explains the implementation of creating and appending HTML elements -in a document using the respective Lexbor library. The example provided is from +in a document using the respective `lexbor` library. The example provided is from the source file [lexbor/html/element_create.c](https://github.com/lexbor/lexbor/blob/master/examples/lexbor/html/element_create.c). @@ -26,7 +26,7 @@ elements, and preserving the overall tree structure through serialization. `lxb_html_document_body_element(document)`, allowing further manipulations to be performed on this node. -4. **Creating Elements**: A loop iterates over all tag IDs defined by the Lexbor +4. **Creating Elements**: A loop iterates over all tag IDs defined by the `lexbor` library, from `LXB_TAG_A` to `LXB_TAG__LAST_ENTRY`. For each tag: - The tag name is retrieved using `lxb_tag_name_by_id`. - An element is created with `lxb_dom_document_create_element`. This function @@ -49,7 +49,7 @@ elements, and preserving the overall tree structure through serialization. ## Conclusion This program effectively showcases the process of dynamically creating HTML -elements using the Lexbor library. It covers the aspects of parsing, element +elements using the `lexbor` library. 
It covers the aspects of parsing, element creation, manipulation, and serialization, providing an essential toolkit for developers looking to work with HTML structures programmatically. The inclusion of error handling ensures reliability, allowing developers to catch and address diff --git a/source/examples/html/element_innerHTML.md b/source/examples/html/element_innerHTML.md index 00a4c4f..3f3bf7f 100644 --- a/source/examples/html/element_innerHTML.md +++ b/source/examples/html/element_innerHTML.md @@ -1,7 +1,7 @@ # Setting innerHTML Example This article will explain the `innerHTML` manipulation in the context of the -Lexbor HTML parser, as illustrated in the source file +`lexbor` HTML parser, as illustrated in the source file [lexbor/html/element_innerHTML.c](https://github.com/lexbor/lexbor/blob/master/examples/lexbor/html/element_innerHTML.c). This example demonstrates how to parse HTML content, modify an element's inner HTML, and serialize the result. @@ -9,7 +9,7 @@ HTML, and serialize the result. ## Code Overview The code starts with the inclusion of the necessary header file, `base.h`, which -likely contains the essential definitions and functions for the Lexbor library. +likely contains the essential definitions and functions for the `lexbor` library. The `main` function serves as the entry point for the execution of this program. ### HTML Parsing @@ -80,6 +80,6 @@ lxb_html_document_destroy(document); The example provided illustrates how to parse an HTML string, modify an element's inner HTML content, and serialize the resulting DOM structure using -Lexbor's capabilities. This demonstrates an essential functionality often used +`lexbor`'s capabilities. This demonstrates an essential functionality often used in web development for DOM manipulation, showcasing the ease of use of the -Lexbor library for such tasks. \ No newline at end of file +`lexbor` library for such tasks. 
\ No newline at end of file diff --git a/source/examples/html/elements_by_attr.md b/source/examples/html/elements_by_attr.md index b7300fe..4171242 100644 --- a/source/examples/html/elements_by_attr.md +++ b/source/examples/html/elements_by_attr.md @@ -2,7 +2,7 @@ This article will explain the functionality and implementation of the code found in **lexbor/html/elements_by_attr.c**, which demonstrates how to retrieve DOM -elements based on specific attributes using the lexbor library. +elements based on specific attributes using the `lexbor` library. ## Overview @@ -142,7 +142,7 @@ This is critical for maintaining memory hygiene in C programs. ## Conclusion This code snippet demonstrates how to efficiently query and manipulate DOM -elements in an HTML document using the lexbor library. By utilizing various +elements in an HTML document using the `lexbor` library. By utilizing various search strategies based on attributes, developers can effectively streamline their DOM interactions, showcasing the flexibility and power of the lexbor library for handling HTML content. \ No newline at end of file diff --git a/source/examples/html/elements_by_class_name.md b/source/examples/html/elements_by_class_name.md index 47d3e0c..ff3b01f 100644 --- a/source/examples/html/elements_by_class_name.md +++ b/source/examples/html/elements_by_class_name.md @@ -4,7 +4,7 @@ In this article, we will explore the implementation details and functionality of the `elements_by_class_name` example, found in the [lexbor/html/elements_by_class_name.c](https://github.com/lexbor/lexbor/blob/master/examples/lexbor/html/elements_by_class_name.c) source file. The code demonstrates how to parse an HTML string and retrieve -elements with a specific class name using the lexbor library. This example is +elements with a specific class name using the `lexbor` library. This example is essential for developers seeking to manipulate and query DOM elements in a structured manner. 
@@ -99,7 +99,7 @@ lxb_html_document_destroy(document); ## Conclusion -The `elements_by_class_name` example illustrates how to use the lexbor library +The `elements_by_class_name` example illustrates how to use the `lexbor` library to parse HTML content, search for elements by class name, and efficiently manage those elements. The critical sections of the code demonstrate proper document handling, error management, and systematic cleanup, providing a solid foundation diff --git a/source/examples/html/elements_by_tag_name.md b/source/examples/html/elements_by_tag_name.md index e7c9be5..259b679 100644 --- a/source/examples/html/elements_by_tag_name.md +++ b/source/examples/html/elements_by_tag_name.md @@ -3,13 +3,13 @@ This article will explain the code found in the source file [lexbor/html/elements_by_tag_name.c](https://github.com/lexbor/lexbor/blob/master/examples/lexbor/html/elements_by_tag_name.c), which demonstrates how to find and print HTML elements by their tag names using -the Lexbor DOM library. +the `lexbor` DOM library. ## Code Overview The purpose of this example is to parse a simple HTML string and retrieve all
`<div>` elements from the parsed document. It achieves this by leveraging the -Lexbor library's DOM capabilities to manage and manipulate the HTML document +`lexbor` library's DOM capabilities to manage and manipulate the HTML document structure. ## Main Function @@ -131,8 +131,8 @@ complete. ## Conclusion -This example serves as a practical demonstration of how to use the Lexbor +This example serves as a practical demonstration of how to use the `lexbor` library to parse HTML and find elements by tag name. By using functions from the library's API, the code effectively processes a document and manages collections -of elements, showcasing the utility of the Lexbor framework in web development +of elements, showcasing the utility of the `lexbor` framework in web development tasks. \ No newline at end of file diff --git a/source/examples/html/encoding.md b/source/examples/html/encoding.md index eb5ea92..645619f 100644 --- a/source/examples/html/encoding.md +++ b/source/examples/html/encoding.md @@ -4,13 +4,13 @@ This article provides an explanation for the HTML Encoding example found in the file [lexbor/html/encoding.c](https://github.com/lexbor/lexbor/blob/master/examples/lexbor/html/encoding.c). This program is designed to read an HTML file, determine its character encoding, -and print it out. The implementation utilizes the Lexbor library, which offers +and print it out. The implementation utilizes the `lexbor` library, which offers various functions to handle encoding. ## Overview The main function of the example handles command-line input, reads an HTML file, -and determines its encoding using the Lexbor library. The code includes a +and determines its encoding using the `lexbor` library. The code includes a failure handling mechanism and a usage function to guide users on how to execute the program properly. @@ -79,6 +79,6 @@ ensure proper resource management.
The HTML Encoding example demonstrates essential practices such as error handling, memory management, and the use of a library to enhance functionality. -By following this example, developers can understand how to utilize the Lexbor +By following this example, developers can understand how to utilize the `lexbor` library for encoding detection in HTML documents, while also adhering to proper coding standards for readability and maintainability. \ No newline at end of file diff --git a/source/examples/html/html2sexpr.md b/source/examples/html/html2sexpr.md index 0cf040a..c2cbba6 100644 --- a/source/examples/html/html2sexpr.md +++ b/source/examples/html/html2sexpr.md @@ -3,14 +3,14 @@ This article provides an overview of a code example found in the file [lexbor/html/html2sexpr.c](https://github.com/lexbor/lexbor/blob/master/examples/lexbor/html/html2sexpr.c). The program is designed to convert an HTML tag tree into an S-expression string -and output it to standard output. The program utilizes the Lexbor library to +and output it to standard output. The program utilizes the `lexbor` library to handle parsing and manipulating HTML documents. ## Overview The program first checks if the correct number of command-line arguments is provided. It expects one argument: the path to an HTML file. It reads the -contents of this file and initializes an HTML document object using Lexbor's +contents of this file and initializes an HTML document object using `lexbor`'s API. After parsing the HTML, the program invokes a tree-walking function to serialize the HTML structure into an S-expression format. The serialized output is then printed to the console. @@ -114,8 +114,8 @@ ensuring resources are released before terminating. ## Conclusion This example demonstrates a straightforward implementation of converting an HTML -document structure into S-expressions using the Lexbor library. The program is +document structure into S-expressions using the `lexbor` library. 
The program is structured to handle input validation, document parsing, tree traversal, and serialization efficiently while providing clear feedback in the case of errors. -It showcases the use of Lexbor's DOM manipulation capabilities and highlights +It showcases the use of `lexbor`'s DOM manipulation capabilities and highlights how to build a recursive tree-walking algorithm for tree serialization. \ No newline at end of file diff --git a/source/examples/html/parse.md b/source/examples/html/parse.md index d88f0a0..f62f9d5 100644 --- a/source/examples/html/parse.md +++ b/source/examples/html/parse.md @@ -1,6 +1,6 @@ # HTML Parsing and Serialization Example -This example demonstrates how to create an HTML parser using the lexbor library, +This example demonstrates how to create an HTML parser using the `lexbor` library, parse simple HTML strings into document objects, and serialize those documents back to a readable format. The code is found in the source file [lexbor/html/parse.c](https://github.com/lexbor/lexbor/blob/master/examples/lexbor/html/parse.c). diff --git a/source/examples/html/parse_chunk.md b/source/examples/html/parse_chunk.md index cd848cc..68390a0 100644 --- a/source/examples/html/parse_chunk.md +++ b/source/examples/html/parse_chunk.md @@ -3,7 +3,7 @@ This article provides an overview of the HTML chunk parsing example implemented in the source file [lexbor/html/parse_chunk.c](https://github.com/lexbor/lexbor/blob/master/examples/lexbor/html/parse_chunk.c). -The example demonstrates how to utilize the Lexbor HTML parsing library to +The example demonstrates how to utilize the `lexbor` HTML parsing library to handle HTML data in incremental chunks. By breaking the input into smaller pieces, it showcases the parser's versatility and ability to manage partial data streams effectively. @@ -11,7 +11,7 @@ streams effectively. ## Code Overview The main function serves as the entry point for the program. 
Here, several -significant components of the Lexbor library are employed, such as creating a +significant components of the `lexbor` library are employed, such as creating a parser, managing HTML documents, and serializing the parsed content. ### Initialization @@ -101,7 +101,7 @@ destroyed, freeing resources that are no longer needed. The example provided in [lexbor/html/parse_chunk.c](https://github.com/lexbor/lexbor/blob/master/examples/lexbor/html/parse_chunk.c) is a straightforward illustration of how to parse HTML data incrementally with -the Lexbor library. By breaking the input into manageable chunks, the parser can +the `lexbor` library. By breaking the input into manageable chunks, the parser can efficiently handle larger HTML documents and offers developers flexibility when processing dynamic or streamed data. This method is particularly useful in web environments where HTML content may not always be available as a single, diff --git a/source/examples/html/tokenizer/callback.md b/source/examples/html/tokenizer/callback.md index 12393fc..5f26acc 100644 --- a/source/examples/html/tokenizer/callback.md +++ b/source/examples/html/tokenizer/callback.md @@ -68,5 +68,5 @@ The `main` function contains several key operations: This example illustrates the use of a callback function within a tokenizer to process HTML tokens sequentially. By gracefully handling errors and providing hooks for further processing, the code affords flexibility and clarity in -parsing HTML inputs using the lexbor library. It exemplifies best practices in +parsing HTML inputs using the `lexbor` library. It exemplifies best practices in resource management, modular function design, and effective error handling in C. 
\ No newline at end of file diff --git a/source/examples/html/tokenizer/simple.md b/source/examples/html/tokenizer/simple.md index de96c1f..67794e7 100644 --- a/source/examples/html/tokenizer/simple.md +++ b/source/examples/html/tokenizer/simple.md @@ -1,7 +1,7 @@ # HTML Tokenizer Example This article provides a detailed explanation of an HTML tokenizer example -implemented in C, demonstrating the capabilities of the lexbor library through +implemented in C, demonstrating the capabilities of the `lexbor` library through the file [lexbor/html/tokenizer/simple.c](https://github.com/lexbor/lexbor/blob/master/examples/lexbor/html/tokenizer/simple.c). This code is intended to parse a simple HTML string and display the tokens diff --git a/source/examples/html/tokenizer/text.md b/source/examples/html/tokenizer/text.md index 6498210..19e3069 100644 --- a/source/examples/html/tokenizer/text.md +++ b/source/examples/html/tokenizer/text.md @@ -3,14 +3,14 @@ This article describes the functionality of the example code provided in the file [lexbor/html/tokenizer/text.c](https://github.com/lexbor/lexbor/blob/master/examples/lexbor/html/tokenizer/text.c). -The code implements an HTML tokenizer using the Lexbor library, focusing on +The code implements an HTML tokenizer using the `lexbor` library, focusing on extracting and printing text tokens from HTML input. ## Overview of the Code The main thrust of this code is to parse HTML data, identify text tokens within it, and print those tokens to the standard output. The code utilizes functions -provided by the Lexbor library, a lightweight and efficient HTML and XML +provided by the `lexbor` library, a lightweight and efficient HTML and XML processing library. ## Key Sections of the Code @@ -19,7 +19,7 @@ processing library. The code begins with the inclusion of the `lexbor/html/tokenizer.h` header file, which contains the necessary declarations for using the tokenizer functionality -of the Lexbor library. 
Following this, a macro named `FAILED` is defined. This +of the `lexbor` library. Following this, a macro named `FAILED` is defined. This macro can be used throughout the code to simplify error handling: ```c @@ -116,7 +116,7 @@ process. ## Conclusion -This example provides a clear illustration of how to utilize the Lexbor library +This example provides a clear illustration of how to utilize the `lexbor` library to parse HTML and process text tokens. By focusing on text tokens, and employing proper error handling mechanics, the code demonstrates a concise yet effective approach to basic HTML tokenization. \ No newline at end of file diff --git a/source/examples/punycode/decode.md b/source/examples/punycode/decode.md index c606294..1ab4e34 100644 --- a/source/examples/punycode/decode.md +++ b/source/examples/punycode/decode.md @@ -10,7 +10,7 @@ domain names (IDNs). ## Overview The core function of this program reads input from standard input, decodes it -using the Lexbor library's Punycode functionality, and outputs the decoded +using the `lexbor` library's Punycode functionality, and outputs the decoded string to standard output. Below, we detail the main components of the code, their functionality, and the logic behind the operations. @@ -100,7 +100,7 @@ data is displayed. ## Conclusion -This example demonstrates how to utilize the Lexbor library for Punycode +This example demonstrates how to utilize the `lexbor` library for Punycode decoding in C. The program handles memory management, input reading, and decoding efficiently while ensuring robustness against common issues like memory allocation failures. 
Through this utility, developers can work with diff --git a/source/examples/punycode/encode.md b/source/examples/punycode/encode.md index ce86dc6..ee5dc37 100644 --- a/source/examples/punycode/encode.md +++ b/source/examples/punycode/encode.md @@ -3,7 +3,7 @@ This article discusses the code example found in the file [lexbor/punycode/encode.c](https://github.com/lexbor/lexbor/blob/master/examples/lexbor/punycode/encode.c), which demonstrates how to encode a string using the Punycode algorithm with the -lexbor library. Punycode is a way to represent Internationalized Domain Names +`lexbor` library. Punycode is a way to represent Internationalized Domain Names (IDNs) using only ASCII characters. This code facilitates reading input data, manages memory allocation dynamically, and encodes the input using a callback function to handle the output. @@ -121,5 +121,5 @@ example, illustrating how to implement Punycode encoding in C. The example highlights important practices such as dynamic memory management, error handling, and the use of callback functions, which are all vital when dealing with input and output in systems programming. By following this structured -approach, developers can efficiently utilize the lexbor library to handle +approach, developers can efficiently utilize the `lexbor` library to handle Internationalized Domain Names. \ No newline at end of file diff --git a/source/examples/selectors/easy_way.md b/source/examples/selectors/easy_way.md index f8f17d8..b1c4118 100644 --- a/source/examples/selectors/easy_way.md +++ b/source/examples/selectors/easy_way.md @@ -2,13 +2,13 @@ This article explains an example program found in the file [lexbor/selectors/easy_way.c](https://github.com/lexbor/lexbor/blob/master/examples/lexbor/selectors/easy_way.c), -which demonstrates how to use the Lexbor library to parse HTML and match it +which demonstrates how to use the `lexbor` library to parse HTML and match it against CSS selectors. 
The example involves creating an HTML document, defining CSS selectors, and then finding matching nodes in the document. ## Overview of the Code -The program begins with the inclusion of necessary headers from the Lexbor +The program begins with the inclusion of necessary headers from the `lexbor` library, specifically for handling HTML documents and CSS selectors. The primary functionalities are encapsulated in multiple functions, including the `callback` function, which prints matched nodes, and the `find_callback` function, which @@ -65,7 +65,7 @@ keeps track of the count of found nodes. ### Conclusion -This example demonstrates the effective use of the Lexbor library for +This example demonstrates the effective use of the `lexbor` library for manipulating and selecting elements within HTML documents based on CSS selectors. By understanding how to parse both HTML and CSS, and by using callback functions to manage matched nodes, developers can efficiently implement diff --git a/source/examples/selectors/normal_way.md b/source/examples/selectors/normal_way.md index 96454e5..a3c68d5 100644 --- a/source/examples/selectors/normal_way.md +++ b/source/examples/selectors/normal_way.md @@ -2,7 +2,7 @@ This example, found in the source file [lexbor/selectors/normal_way.c](https://github.com/lexbor/lexbor/blob/master/examples/lexbor/selectors/normal_way.c), -demonstrates how to use the Lexbor library to parse CSS selectors and find HTML +demonstrates how to use the `lexbor` library to parse CSS selectors and find HTML nodes that match those selectors. The code provides a comprehensive workflow, from creating an HTML document to parsing selectors and retrieving matching nodes while handling memory management efficiently. @@ -11,7 +11,7 @@ nodes while handling memory management efficiently. The main function serves as the central processing unit of the code, orchestrating the various tasks. 
It initializes necessary structures, parses an -HTML string, sets up CSS selectors, and employs the Lexbor library's +HTML string, sets up CSS selectors, and employs the `lexbor` library's capabilities to find nodes in the document. ### HTML and CSS Data @@ -102,7 +102,7 @@ each created object, adhering to good practices in C coding. ## Conclusion In summary, this example outlines a practical implementation of HTML and CSS -handling using the Lexbor library. It emphasizes the importance of robust memory +handling using the `lexbor` library. It emphasizes the importance of robust memory management, selector parsing, and node finding functionalities, making it a -valuable reference for developers looking to understand or utilize Lexbor in +valuable reference for developers looking to understand or utilize `lexbor` in their projects. \ No newline at end of file diff --git a/source/examples/selectors/unique_nodes.md b/source/examples/selectors/unique_nodes.md index 88697d8..be222dd 100644 --- a/source/examples/selectors/unique_nodes.md +++ b/source/examples/selectors/unique_nodes.md @@ -2,7 +2,7 @@ This article discusses the functionality of the `unique_nodes.c` source file, which implements a basic example of parsing HTML and CSS selectors using the -lexbor library. The example illustrates how to create an HTML document, parse +`lexbor` library. The example illustrates how to create an HTML document, parse CSS selectors, and find nodes within the document that match those selectors. ## Key Components @@ -80,6 +80,6 @@ The `unique_nodes.c` example illustrates a practical application of the lexbor library to handle HTML documents and CSS selectors. By showcasing the entire lifecycle from parsing HTML to finding nodes based on CSS selectors, this example serves as an informative foundation for developers looking to work with -document structures and styles in C using the lexbor library. The implemented +document structures and styles in C using the `lexbor` library. 
The implemented logic emphasizes efficiency and clarity, ensuring that the handling of selectors and nodes is both effective and straightforward. \ No newline at end of file diff --git a/source/examples/styles/attribute_style.md b/source/examples/styles/attribute_style.md index 7fa23db..760f33e 100644 --- a/source/examples/styles/attribute_style.md +++ b/source/examples/styles/attribute_style.md @@ -66,7 +66,7 @@ during program execution. ## Conclusion This code example illustrates how to manipulate and retrieve CSS properties from -an HTML element using the lexbor library. It covers creating an HTML document, +an HTML element using the `lexbor` library. It covers creating an HTML document, parsing content, accessing specific elements, and outputting style properties, providing a comprehensive look at handling HTML and CSS in C with lexbor. The example highlights the importance of proper resource management and error diff --git a/source/examples/styles/events_insert.md b/source/examples/styles/events_insert.md index c436294..2978652 100644 --- a/source/examples/styles/events_insert.md +++ b/source/examples/styles/events_insert.md @@ -3,7 +3,7 @@ This article explains the C code found in [lexbor/styles/events_insert.c](https://github.com/lexbor/lexbor/blob/master/examples/lexbor/styles/events_insert.c), which demonstrates the process of manipulating HTML documents and applying CSS -styles using the Lexbor library. The code operates on a simple HTML structure +styles using the `lexbor` library. The code operates on a simple HTML structure and applies specific styles based on a CSS stylesheet. ## Overview @@ -16,7 +16,7 @@ insert a new HTML element. Here's a breakdown of the major sections of the code. 
### Includes and Definitions -The code begins with the inclusion of necessary header files from the Lexbor +The code begins with the inclusion of necessary header files from the `lexbor` library, which are essential for HTML, CSS, and selector functionalities: ```c @@ -134,8 +134,8 @@ destroying collections, stylesheets, and the document itself. The code in [lexbor/styles/events_insert.c](https://github.com/lexbor/lexbor/blob/master/examples/lexbor/styles/events_insert.c) -illustrates an effective use of the Lexbor library to manipulate HTML and apply +illustrates an effective use of the `lexbor` library to manipulate HTML and apply CSS. By parsing, creating elements, setting attributes, and attaching styles, it provides a clear example of dynamic document editing and processing. This -showcases both the capabilities and convenience of the Lexbor framework in +showcases both the capabilities and convenience of the `lexbor` framework in handling web technologies programmatically. \ No newline at end of file diff --git a/source/examples/styles/stylesheet.md b/source/examples/styles/stylesheet.md index d303898..a1afddb 100644 --- a/source/examples/styles/stylesheet.md +++ b/source/examples/styles/stylesheet.md @@ -1,7 +1,7 @@ # CSS Stylesheet Parsing and Application Example In this article, we will explore the implementation of CSS stylesheet parsing -and application to HTML elements using the Lexbor library. The following example +and application to HTML elements using the `lexbor` library. The following example is derived from the source file [lexbor/styles/stylesheet.c](https://github.com/lexbor/lexbor/blob/master/examples/lexbor/styles/stylesheet.c). The code illustrates how to create an HTML document, parse CSS styles, attach @@ -12,7 +12,7 @@ style declarations from an element. The core of the example revolves around creating a minimal HTML document that contains a
`<div>` element with inline CSS styles. The code then initializes the -Lexbor HTML and CSS parsers, processes the provided CSS, and attaches the styles +`lexbor` HTML and CSS parsers, processes the provided CSS, and attaches the styles to the HTML document. Finally, it retrieves specific CSS properties (width and height) from the
`<div>` element and serializes them for output. @@ -159,7 +159,7 @@ resources, ensuring there are no memory leaks: ## Conclusion The presented example demonstrates the process of parsing and applying CSS -styles to an HTML document using the Lexbor library. By following through each +styles to an HTML document using the `lexbor` library. By following through each part of the code, one can gain insights into how to effectively manage CSS properties within a structured HTML environment, allowing for flexible design and styling in modern web applications. \ No newline at end of file diff --git a/source/examples/styles/walk.md b/source/examples/styles/walk.md index 32b79b4..265be3e 100644 --- a/source/examples/styles/walk.md +++ b/source/examples/styles/walk.md @@ -5,7 +5,7 @@ This article explains the functionality and structure of the code found in The example focuses on parsing an HTML document, attaching CSS styles to an element, and traversing the applied styles. The primary goal of this example is to demonstrate how to manipulate the Document Object Model (DOM) and apply CSS -styling in the Lexbor library. +styling in the `lexbor` library. ## Overview of the Code @@ -16,7 +16,7 @@ elements. ### Include Directives and Function Prototypes -The code begins by including essential header files from the Lexbor library, +The code begins by including essential header files from the `lexbor` library, specifically for HTML and CSS functionalities. It defines two primary callback functions: @@ -78,7 +78,7 @@ memory leaks and ensuring efficient resource management. ## Conclusion This code example highlights the integration of HTML parsing and CSS styling -using the Lexbor library. By utilizing the provided functions and callback +using the `lexbor` library. By utilizing the provided functions and callback methods, developers can effectively manipulate and inspect styles associated with HTML elements.
The careful arrangement of initialization, parsing, walking through styles, and resource cleanup demonstrates best practices in managing diff --git a/source/examples/unicode/normalization_form.md b/source/examples/unicode/normalization_form.md index b2841ca..19ae20e 100644 --- a/source/examples/unicode/normalization_form.md +++ b/source/examples/unicode/normalization_form.md @@ -2,7 +2,7 @@ This article explains the example code found in the file [lexbor/unicode/normalization_form.c](https://github.com/lexbor/lexbor/blob/master/examples/lexbor/unicode/normalization_form.c). -The program demonstrates how to perform Unicode normalization using the Lexbor +The program demonstrates how to perform Unicode normalization using the `lexbor` library, specifically focusing on four normalization forms: NFC, NFD, NFKC, and NFKD. @@ -61,7 +61,7 @@ function processes the normalized output. The `callback` function accepts the normalized data, its length, and a context string (the name of the normalization form). Inside this function, the received -data is processed to decode valid UTF-8 sequences. It utilizes the Lexbor +data is processed to decode valid UTF-8 sequences. It utilizes the `lexbor` function `lxb_encoding_decode_valid_utf_8_single()` to decode each character code point and print it in hexadecimal format. @@ -82,5 +82,5 @@ returns a success status, indicating that all operations were completed without errors. This example provides a practical approach to understanding how Unicode -normalization works in the Lexbor library and demonstrates how to handle Unicode +normalization works in the `lexbor` library and demonstrates how to handle Unicode strings effectively. 
\ No newline at end of file diff --git a/source/examples/unicode/normalization_form_stdin.md b/source/examples/unicode/normalization_form_stdin.md index 89a1ba4..6abeb9b 100644 --- a/source/examples/unicode/normalization_form_stdin.md +++ b/source/examples/unicode/normalization_form_stdin.md @@ -52,7 +52,7 @@ potential null-termination issues gracefully. ## Conclusion This example illustrates how to implement a basic command line utility for -Unicode normalization using the lexbor library. It effectively demonstrates +Unicode normalization using the `lexbor` library. It effectively demonstrates handling input, processing data with a normalization algorithm, and producing output. This utility can be useful in applications where consistent Unicode representation is crucial, such as in text processing and data interchange diff --git a/source/examples/url/parse.md b/source/examples/url/parse.md index 834bd02..57e0697 100644 --- a/source/examples/url/parse.md +++ b/source/examples/url/parse.md @@ -2,7 +2,7 @@ This article examines a code example from the [lexbor/url/parse.c](https://github.com/lexbor/lexbor/blob/master/examples/lexbor/url/parse.c) -file, focusing on URL parsing using the Lexbor library. The intent of this code +file, focusing on URL parsing using the `lexbor` library. The intent of this code is to demonstrate how to initialize the URL parser, parse a URL string, and subsequently serialize different components of the parsed URL, such as the scheme, username, password, host, and more. Each section of the code plays a @@ -12,7 +12,7 @@ critical role in handling URL data. ### Initialization -The code begins by including the necessary header for the Lexbor URL library and +The code begins by including the necessary header for the `lexbor` URL library and defining a static callback function. In the `main` function, several variables are declared, including a pointer to `lxb_url_t`, an instance of `lxb_url_parser_t`, and `lxb_unicode_idna_t`. 
@@ -109,9 +109,9 @@ URL memory, ensuring that no resources are leaked. ### Conclusion -The example succinctly demonstrates the capabilities of the Lexbor URL parsing +The example succinctly demonstrates the capabilities of the `lexbor` URL parsing library, showcasing how to initialize the parser, handle a complex URL with Unicode characters, and serialize its components. Each part of the code works harmoniously to show how flexible and powerful URL handling can be in modern C -programming with the Lexbor library. The proper initialization, error handling, +programming with the `lexbor` library. The proper initialization, error handling, and cleanup are crucial for robust application development. \ No newline at end of file diff --git a/source/examples/url/relative.md b/source/examples/url/relative.md index 6c26995..dc76f6a 100644 --- a/source/examples/url/relative.md +++ b/source/examples/url/relative.md @@ -4,7 +4,7 @@ This article provides an explanation of the URL parsing example found in the source file [lexbor/url/relative.c](https://github.com/lexbor/lexbor/blob/master/examples/lexbor/url/relative.c). The example demonstrates the parsing of a relative URL based on a provided base -URL using the lexbor library. It outlines the setup of the URL parser, handling +URL using the `lexbor` library. It outlines the setup of the URL parser, handling of input strings, and the serialization of various components of the parsed URL. ## Code Breakdown @@ -100,7 +100,7 @@ utilized during the parsing are properly released. ### Conclusion The provided example illustrates the process of relative URL parsing using the -lexbor library. From initializing the parser to serializing specific components +`lexbor` library. From initializing the parser to serializing specific components of the URL, each step is crucial for accurate URL handling in applications. The careful management of memory and resources also highlights best practices in programming with C. 
\ No newline at end of file From 5a27c7f3ef25d0a199f3e091b43b43b4fa0a1766 Mon Sep 17 00:00:00 2001 From: Toxypi Date: Sat, 28 Sep 2024 19:23:04 +0100 Subject: [PATCH 7/9] Normalized example structure, content. --- source/examples/css/StyleSheet.md | 143 ++++----- .../examples/css/selectors/list_easy_way.md | 126 ++++---- .../examples/css/selectors/list_fast_way.md | 188 ++++++++---- source/examples/css/syntax/simple_colorize.md | 242 ++++++++++----- .../css/syntax/structure_parse_file.md | 214 +++++++++---- .../css/syntax/tokenizer/chunks_stdin.md | 136 ++++----- .../css/syntax/tokenizer/from_file.md | 136 +++------ .../css/syntax/tokenizer/print_raw.md | 147 ++++----- .../examples/encoding/buffer/decode/decode.md | 100 +++--- .../encoding/buffer/decode/decoder.md | 159 ++++++---- .../encoding/buffer/decode/validate.md | 137 ++++----- .../examples/encoding/buffer/encode/encode.md | 123 +++----- .../encoding/buffer/encode/encoder.md | 182 +++++------ .../encoding/buffer/encode/validate.md | 188 +++++++----- source/examples/encoding/buffer/from_to.md | 259 ++++++++++------ source/examples/encoding/data_by_name.md | 111 ++++--- .../examples/encoding/single/decode/decode.md | 171 ++++------- .../encoding/single/decode/decoder.md | 187 ++++-------- .../encoding/single/decode/validate.md | 146 ++++----- .../examples/encoding/single/encode/encode.md | 125 ++++---- .../encoding/single/encode/encoder.md | 169 +++++----- .../encoding/single/encode/validate.md | 149 +++++---- source/examples/encoding/single/from_to.md | 218 +++++++------ source/examples/html/document_parse.md | 93 ++---- source/examples/html/document_parse_chunk.md | 108 +++---- source/examples/html/document_title.md | 158 +++++----- source/examples/html/element_attributes.md | 163 +++++----- source/examples/html/element_create.md | 175 +++++++---- source/examples/html/element_innerHTML.md | 97 +++--- source/examples/html/elements_by_attr.md | 185 +++++------ .../examples/html/elements_by_class_name.md | 114 
+++---- source/examples/html/elements_by_tag_name.md | 142 ++++----- source/examples/html/encoding.md | 144 +++++---- source/examples/html/html2sexpr.md | 288 +++++++++++++----- source/examples/html/parse.md | 156 ++++++---- source/examples/html/parse_chunk.md | 103 +++---- source/examples/html/tokenizer/callback.md | 125 ++++---- source/examples/html/tokenizer/simple.md | 207 +++++++++---- .../examples/html/tokenizer/tag_attributes.md | 197 +++++++----- source/examples/html/tokenizer/text.md | 151 ++++----- source/examples/punycode/decode.md | 157 +++++----- source/examples/punycode/encode.md | 141 +++++---- source/examples/selectors/easy_way.md | 158 ++++++---- source/examples/selectors/normal_way.md | 160 +++++----- source/examples/selectors/unique_nodes.md | 216 ++++++++----- source/examples/styles/attribute_style.md | 162 ++++++---- source/examples/styles/events_insert.md | 157 +++++----- source/examples/styles/stylesheet.md | 176 +++++------ source/examples/styles/walk.md | 247 ++++++++++----- source/examples/unicode/idna_to_ascii.md | 160 +++++----- source/examples/unicode/normalization_form.md | 151 +++++---- .../unicode/normalization_form_stdin.md | 170 ++++++++--- source/examples/url/parse.md | 139 +++++---- source/examples/url/relative.md | 180 ++++++----- 54 files changed, 4795 insertions(+), 3941 deletions(-) diff --git a/source/examples/css/StyleSheet.md b/source/examples/css/StyleSheet.md index c859da7..9ff7def 100644 --- a/source/examples/css/StyleSheet.md +++ b/source/examples/css/StyleSheet.md @@ -1,70 +1,40 @@ -# CSS Stylesheet Parsing Example +# Parsing and Serializing CSS Stylesheet: Example -This article explains the example code within the file -[lexbor/css/StyleSheet.c](https://github.com/lexbor/lexbor/blob/master/examples/lexbor/css/StyleSheet.c), -which demonstrates how to use the `lexbor` library to read and parse a CSS -stylesheet. 
The code showcases the steps required to initialize the parser, read -the CSS data from a file, parse the stylesheet, and serialize the resulting -object. +This article explains an intermediate-to-advanced example code that demonstrates +how to parse and serialize a CSS stylesheet using the `lexbor` library. The +example can be found in the file `lexbor/css/StyleSheet.c`. -## Code Breakdown +The provided code example demonstrates how to read a CSS file, parse it using +the `lexbor` library, and then serialize the parsed CSS back to a string. This +example is valuable for developers looking to understand how to interact with +CSS data programmatically using `lexbor`. -### Includes and Function Declaration - -The code begins by including the necessary headers: `base.h` for foundational -functionalities and `lexbor/core/fs.h` and `lexbor/css/css.h` for file system -operations and CSS processing respectively. - -### Callback Function - -A callback function is defined that takes a pointer to character data, its -length, and a context pointer as parameters: - -```c -lxb_status_t callback(const lxb_char_t *data, size_t len, void *ctx) { - printf("%.*s", (int) len, data); - return LXB_STATUS_OK; -} -``` - -This function will be used later to output the serialized CSS rules. It prints -the data passed to it, formatted to handle the length of the string, ensuring -that only the relevant part of the buffer is printed. - -### Main Function - -The `main` function initializes the program and takes one argument: the path to -a CSS file. It begins by checking if the number of arguments is correct and -printing usage instructions if not: - -```c -if (argc != 2) { - fprintf(stderr, "Usage:\n"); - fprintf(stderr, "\tStyleSheet <file>\n"); - FAILED("Invalid number of arguments"); -} -``` ### Reading the CSS File -Next, the code reads the contents of the specified CSS file into memory: +The first significant operation in the code is reading the contents of a CSS +file. 
This is done using the `lexbor_fs_file_easy_read` function which reads the +contents into memory. ```c fl = (const lxb_char_t *) argv[1]; + css = lexbor_fs_file_easy_read(fl, &css_len); if (css == NULL) { FAILED("Failed to read CSS file"); } ``` -The `lexbor_fs_file_easy_read` function loads the file into the `css` buffer, -and the length of the data is stored in `css_len`. If reading the file fails, an -error message is displayed. +Here, `argv[1]` is expected to contain the path to the CSS file. The function +`lexbor_fs_file_easy_read` reads the file into a dynamically allocated buffer, +with `css_len` capturing the length of the data. If the file read fails, the +program exits with an error. -### Parsing the CSS +### Initializing the Parser -After successfully loading the CSS data, a CSS parser is created and -initialized: +Next, the code initializes a `lexbor` CSS parser. This involves creating a +parser instance and initializing it. ```c parser = lxb_css_parser_create(); @@ -74,36 +44,34 @@ if (status != LXB_STATUS_OK) { } ``` -The parser initialization must succeed; otherwise, the program exits early with -an error message. +First, a new parser instance is created using `lxb_css_parser_create()`. The +`lxb_css_parser_init` function initializes this parser. If the initialization +fails, an error is reported and the program exits. -### StyleSheet Parsing +### Parsing the Stylesheet -The actual parsing occurs with the following line: +Once the parser is ready, the next task is to parse the contents of the CSS +file. ```c sst = lxb_css_stylesheet_parse(parser, css, css_len); -``` - -Here, `lxb_css_stylesheet_parse` processes the loaded CSS content and generates -a stylesheet object, `sst`. If parsing fails, the program will exit. 
-### Memory Management - -Following the parsing step, memory for the CSS buffer is freed, and the parser -is destroyed: - -```c (void) lexbor_free(css); (void) lxb_css_parser_destroy(parser, true); + +if (sst == NULL) { + FAILED("Failed to parse CSS"); +} ``` -This cleanup is essential to avoid memory leaks in the application. +The function `lxb_css_stylesheet_parse` parses the CSS data stored in the buffer +`css` with length `css_len`. After parsing, the buffer is freed and the parser +is destroyed. If parsing fails, the program reports an error and exits. -### Serializing the Output +### Serializing the Stylesheet -The code then serializes the stylesheet and outputs the rules using the -previously defined callback: +After parsing the stylesheet, the example serializes it back to a string using a +callback function. ```c status = lxb_css_rule_serialize(sst->root, callback, NULL); @@ -112,23 +80,40 @@ if (status != LXB_STATUS_OK) { } ``` -This process invokes the callback for each rule in the stylesheet, allowing for -customizable output handling. +The function `lxb_css_rule_serialize` walks through the stylesheet rules, +serializing each one. The `callback` function is called for each chunk of data +during serialization. If an error occurs during serialization, the program +reports it and exits. -### Final Cleanup +### The Callback Function -Finally, the stylesheet object is destroyed to free up resources: +The callback function is straightforward but crucial for outputting the +serialized data. ```c -(void) lxb_css_stylesheet_destroy(sst, true); +lxb_status_t +callback(const lxb_char_t *data, size_t len, void *ctx) +{ + printf("%.*s", (int) len, data); + return LXB_STATUS_OK; +} ``` -The program concludes successfully by returning `EXIT_SUCCESS`. +This function simply prints each chunk of serialized data to the standard +output. The `printf` function uses the precision field to handle the length of +data correctly. 
+ +## Notes + +- Ensure that the CSS file exists and is accessible. +- Error handling is fundamental when dealing with file operations and parsing. +- The example provides a clear pathway from reading a file to parsing and + serializing CSS. ## Summary -In this example, a CSS file is read, parsed, and its contents serialized using -the `lexbor` library. Each significant section of the code has been explained to -provide clarity on the parsing process and resource management. By following -these steps, developers can incorporate CSS parsing capabilities into their -applications using `lexbor`. \ No newline at end of file +This example effectively demonstrates how to use the `lexbor` library to handle +CSS files. It highlights reading a file, parsing the CSS content, and +serializing the parsed content back to a string. Understanding this example +enables developers to manage CSS data programmatically with `lexbor`, which can +be extended and integrated into larger projects dealing with CSS manipulation. \ No newline at end of file diff --git a/source/examples/css/selectors/list_easy_way.md b/source/examples/css/selectors/list_easy_way.md index fe38e23..de95de4 100644 --- a/source/examples/css/selectors/list_easy_way.md +++ b/source/examples/css/selectors/list_easy_way.md @@ -1,48 +1,17 @@ -# CSS Selector Parsing Example +# Parsing and Serializing CSS Selectors with lexbor: Example -This article provides an in-depth explanation of the code found in -`list_easy_way.c`, which demonstrates how to use the `lexbor` library for parsing -CSS selectors. The code illustrates the steps involved in initializing a parser, -parsing a CSS selector string, and handling the results and logs. +This article discusses a C code example from the `lexbor` library, specifically focusing on the file `lexbor/css/selectors/list_easy_way.c`. The example demonstrates how to parse CSS selectors and serialize them using the capabilities provided by `lexbor`. 
-## Code Overview +In this example, we'll cover the process of creating a CSS parser using `lexbor`, parsing a complex selector string, and then serializing it back to a readable form. The workflow includes initialization of the parser, parsing the selector, error handling, and cleanup of resources. -The example begins by including the necessary header file from the lexbor CSS -library. The main purpose of this code is to showcase the parsing of a CSS -selector string, specifically `:has(div, :not(as, 1%, .class), #hash)`, using -the lexbor's CSS parser. +## Key Code Sections -## Key Sections of the Code +### Initialization of the CSS Parser -### Callback Function - -The `callback` function is defined to handle output during the serialization -process of the CSS selector list. It takes three parameters: a character pointer -to the data, the length of that data, and a context pointer. Inside the -function, the data is printed to the standard output using `printf`, formatted -to respect the length provided. - -```c -lxb_status_t callback(const lxb_char_t *data, size_t len, void *ctx) { - printf("%.*s", (int) len, (const char *) data); - return LXB_STATUS_OK; -} -``` - -### Main Function - -The `main` function begins by declaring variables for the parser and the -selector list. It initializes the necessary constants for indentation used in -log formatting and specifies the CSS selector string to be parsed. - -#### Parser Initialization - -A parser is created with `lxb_css_parser_create()`, and its initialization is -performed with `lxb_css_parser_init()`. The code checks the return status of the -initialization and exits gracefully if there is an issue, preventing further -execution with an invalid parser instance. +First, we need to create and initialize a CSS parser. 
```c +lxb_css_parser_t *parser; parser = lxb_css_parser_create(); status = lxb_css_parser_init(parser, NULL); if (status != LXB_STATUS_OK) { @@ -50,14 +19,14 @@ if (status != LXB_STATUS_OK) { } ``` -#### Parsing the Selector +Here, `lxb_css_parser_create` allocates memory for the parser, while `lxb_css_parser_init` initializes the parser. If initialization fails (`status != LXB_STATUS_OK`), the program exits with `EXIT_FAILURE`. + +### Parsing CSS Selectors -The parsing of the CSS selector occurs with the function -`lxb_css_selectors_parse()`, which takes the parser, the selector string, and -its length as arguments. The status of the parser is checked afterward to ensure -that the parsing was successful. +Next, we define our CSS selectors string and parse it. ```c +static const lxb_char_t slctrs[] = ":has(div, :not(as, 1%, .class), #hash)"; list = lxb_css_selectors_parse(parser, slctrs, sizeof(slctrs) / sizeof(lxb_char_t) - 1); if (parser->status != LXB_STATUS_OK) { @@ -66,40 +35,69 @@ if (parser->status != LXB_STATUS_OK) { } ``` -#### Selector List Serialization +The function `lxb_css_selectors_parse` accepts the parser, the CSS selector string, and its length. It returns a pointer to `lxb_css_selector_list_t`, which represents the parsed selector list. Error handling confirms if the parsing was successful by checking `parser->status`. -The parsed selector list is then serialized using -`lxb_css_selector_serialize_list()`, which invokes the previously defined -`callback` function. This outputs the result of the serialization to standard -output. +### Callback Function for Serialization + +We use a callback function to process the serialized data. ```c +lxb_status_t +callback(const lxb_char_t *data, size_t len, void *ctx) +{ + printf("%.*s", (int) len, (const char *) data); + return LXB_STATUS_OK; +} +``` + +The callback simply prints the serialized data to the standard output. 
+ +### Serializing the Selector List + +We serialize the list of selectors to a readable form. + +```c +printf("Result: "); (void) lxb_css_selector_serialize_list(list, callback, NULL); +printf("\n"); ``` -### Handling Logs +The `lxb_css_selector_serialize_list` function processes each selector in the list, calling the `callback` for each. -If there are any logs generated during parsing, they are checked with -`lxb_css_log_length()`, and the log is serialized in a similar manner, making -use of the callback function and proper indentation for the displayed log. +### Logging and Error Messages -### Cleanup +If there are any log messages, we serialize and print them. -Finally, the example demonstrates proper resource management by destroying the -parser and the associated memory. This is crucial in C programming to prevent -memory leaks. The parser is destroyed first, followed by the cleanup of the -selector list's memory. +```c +if (lxb_css_log_length(lxb_css_parser_log(parser)) != 0) { + static const lxb_char_t indent[] = " "; + static const size_t indent_length = sizeof(indent) / sizeof(lxb_char_t) - 1; + + printf("Log:\n"); + (void) lxb_css_log_serialize(parser->log, callback, NULL, indent, indent_length); + printf("\n"); +} +``` + +Here, `lxb_css_log_serialize` formats any log messages using the provided indentation and then calls the callback for each log entry. + +### Cleanup Resources + +Finally, we must clean up allocated resources. ```c (void) lxb_css_parser_destroy(parser, true); lxb_css_selector_list_destroy_memory(list); ``` -## Conclusion +`lxb_css_parser_destroy` and `lxb_css_selector_list_destroy_memory` ensure that all memory allocated for the parser and selector list is properly freed. + +## Notes + +- The example demonstrates robust error handling tied with each crucial step. +- Serialization is handled via callback functions, which offers flexibility for different output handling needs. 
+- Proper memory management is critical, underscored by the cleanup section. + +## Summary -This example effectively showcases the functionality of the lexbor CSS library -for parsing CSS selectors. From initializing the parser to handling logs and -cleaning up memory, each step is crucial for ensuring that the program runs -efficiently and correctly. The structured approach presented in the code -promotes good practices in C programming, particularly regarding memory -management and error handling. \ No newline at end of file +This example from `lexbor` showcases a complete cycle from parsing a complex CSS selector string to its serialization and error logging. For users looking to leverage the `lexbor` library, understanding this example is fundamental as it highlights key functionalities: parser creation, selector parsing, serialization, and resource management. By mastering these steps, developers can efficiently integrate CSS parsing capabilities into their applications. \ No newline at end of file diff --git a/source/examples/css/selectors/list_fast_way.md b/source/examples/css/selectors/list_fast_way.md index 9eba765..6d1b7f6 100644 --- a/source/examples/css/selectors/list_fast_way.md +++ b/source/examples/css/selectors/list_fast_way.md @@ -1,83 +1,147 @@ -# CSS Selectors Parsing Example +# Understanding Fast CSS Selector Parsing with `lexbor`: Example + +This article provides a detailed explanation of the `lexbor/css/selectors/list_fast_way.c` +example from the `lexbor` library, focusing on its intermediate-to-advanced +aspects. The example demonstrates how to efficiently parse and process CSS selectors +using `lexbor`. We will look into the key sections, including initialization, +parsing, and serialization of selectors, highlighting the intent and logic behind +these implementations. -This article explains the functionality present in the `list_fast_way.c` source -file of the lexbor CSS library, illustrating how to parse CSS selectors -effectively. 
The primary goal of the code is to demonstrate the parsing of -various CSS selectors and report the results, including any parsing warnings -that may arise. +## Key Code Sections -## Code Structure Overview +### Initialization of Memory and Parser -The entire program is structured around a single function `main`, which is the -entry point when the program is executed. Several components of the code are -critical to understanding how it prepares for and executes CSS selector parsing. +The example starts with the initialization of memory and parser objects. Here are the +relevant portions of the code: -### Including Required Headers +```c +lxb_css_memory_t *memory; +lxb_css_parser_t *parser; -The program begins by including `lexbor/css/css.h`, which is essential as it -provides the functions, types, and structures required for working with the -lexbor CSS parser. +/* Memory for all parsed structures. */ +memory = lxb_css_memory_create(); +status = lxb_css_memory_init(memory, 128); +if (status != LXB_STATUS_OK) { + return EXIT_FAILURE; +} -### Callback Function +/* Create parser. */ +parser = lxb_css_parser_create(); +status = lxb_css_parser_init(parser, NULL); +if (status != LXB_STATUS_OK) { + return EXIT_FAILURE; +} +``` -The `callback` function is defined to handle logging messages that arise during -the CSS parsing process. It takes in data and its length, printing the message -using `printf`. This function is a straightforward implementation that merely -outputs the parsed messages but can be extended for more complex handlers if -needed. +The `lxb_css_memory_create` and `lxb_css_memory_init` functions are used to create and +initialize a memory pool that will be used for storing parsed structures. The parser is +created with `lxb_css_parser_create` and initialized with `lxb_css_parser_init`. These +steps ensure that memory management is handled efficiently. 
-### Main Function Logic +### Memory Binding to Parser -Inside the `main` function, the following key operations are performed: +One crucial aspect is binding the memory pool to the parser, preventing redundant memory +allocations. The following lines achieve this: -1. **Memory Setup:** - - A memory object is created using `lxb_css_memory_create`, which acts as a - buffer for all parsed structures. - - Initialization of memory is conducted with `lxb_css_memory_init`, setting - aside an initial block of 128 bytes. +```c +/* Bind memory to parser */ +lxb_css_parser_memory_set(parser, memory); +``` -2. **Parser Initialization:** - - A CSS parser object is instantiated with `lxb_css_parser_create` and - initialized using `lxb_css_parser_init`. +By binding the memory object to the parser using `lxb_css_parser_memory_set`, the example +ensures that all parsed structures share the same memory pool, promoting efficiency and +preventing memory fragmentation. -3. **Binding the Memory and Selectors:** - - It is crucial to bind the created memory object to the parser to prevent - memory allocation issues during parsing. This is achieved using - `lxb_css_parser_memory_set`. - - Similarly, a selectors object is created and initialized. This object must - also be bound to the parser, so its data can be managed correctly while - parsing CSS selectors. +### Creating and Binding Selectors -### Parsing CSS Selectors +Selectors are created and bound to the parser, ensuring streamlined parsing operations: -The program defines a static array of CSS selectors to be parsed. Each selector -is processed in a loop, where: +```c +lxb_css_selectors_t *selectors; +selectors = lxb_css_selectors_create(); +status = lxb_css_selectors_init(selectors); +if (status != LXB_STATUS_OK) { + return EXIT_FAILURE; +} +lxb_css_parser_selectors_set(parser, selectors); +``` -- The parser attempts to parse each selector using `lxb_css_selectors_parse`. 
-- The output is assessed to determine if parsing was successful or if there were - any warnings or errors. Any issues are logged using the `callback` function, - which provides informative feedback. +The selectors object is created and initialized through `lxb_css_selectors_create` and +`lxb_css_selectors_init`. Binding the selectors to the parser with +`lxb_css_parser_selectors_set` prevents the creation of new selectors objects on each +parsing operation. -### Resource Cleanup +### Parsing Selectors -After all parsing operations, the program ensures to destroy the selectors and -parser resources, calling `lxb_css_selectors_destroy` and -`lxb_css_parser_destroy`. This step is crucial in managing memory and avoiding -leaks in longer-running applications. +The central part of the example involves parsing the CSS selectors provided in an array: -### Serialization of Results +```c +const char *slctrs[] = { ":not()", "div #hash [refs=i]", "div.class", ... }; -Finally, the parsed selector lists are serialized and printed. For each -selector, the program checks if any parsing results were generated by the -`lxb_css_selector_serialize_list` function. If a selector results in an empty -list, it is noted accordingly. +for (i = 0; slctrs[i] != NULL; i++) { + lists[i] = lxb_css_selectors_parse(parser, (const lxb_char_t *) slctrs[i], + strlen(slctrs[i])); + if (parser->status != LXB_STATUS_OK) { + /* Handle parse error */ + } else { + /* Handle parse success */ + } +} +``` -### Conclusion +The array `slctrs` contains various CSS selectors to parse. The `lxb_css_selectors_parse` +function is called for each selector, and its result is stored in the `lists` array. The +parser's status is checked to determine if the parsing was successful. -The `list_fast_way.c` example serves as a practical guide for developers looking -to understand how to parse CSS selectors using the `lexbor` library. 
By -emphasizing memory management, proper initialization, and error handling, this -example lays a solid foundation for further applications of the library in -real-world projects. The code harnesses the flexibility of lexbor while -maintaining clarity and efficiency in parsing operations, making it an -invaluable resource for CSS-related development. \ No newline at end of file +### Log Serialization + +In case of errors or warnings during parsing, the logs are serialized and printed: + +```c +(void) lxb_css_log_serialize(parser->log, callback, NULL, indent, indent_length); +``` + +The `lxb_css_log_serialize` function serializes the log information, using a `callback` +to output the serialized data. This helps in diagnosing issues during the parsing process. + +### Cleanup Resources + +Once parsing is complete, the resources associated with the parser and selectors are +destroyed: + +```c +(void) lxb_css_selectors_destroy(selectors, true); +(void) lxb_css_parser_destroy(parser, true); +``` + +Destroying these resources ensures that any allocated memory is properly freed, preventing +memory leaks. + +### Outputting Results + +The parsed selector lists are then serialized and outputted: + +```c +for (i = 0; slctrs[i] != NULL; i++) { + if (lists[i] != NULL) { + (void) lxb_css_selector_serialize_list(lists[i], callback, NULL); + } +} +``` + +Each parsed selector list is serialized using `lxb_css_selector_serialize_list`, and the +results are printed. This demonstrates the outcomes of the parsing operations. + +## Notes + +- Binding memory and selectors to the parser improves efficiency and prevents + redundant memory allocations. +- Proper error handling and log serialization provide insights into parsing issues. +- Resource cleanup is essential to prevent memory leaks. 
+ +## Summary + +This example illustrates efficient parsing of CSS selectors using the `lexbor` library by +binding memory and selectors to the parser, parsing various selectors, handling errors, +and serializing the results. Understanding these techniques is valuable for developers +looking to leverage `lexbor` for high-performance CSS parsing in their applications. \ No newline at end of file diff --git a/source/examples/css/syntax/simple_colorize.md b/source/examples/css/syntax/simple_colorize.md index 4977966..8fc5ec2 100644 --- a/source/examples/css/syntax/simple_colorize.md +++ b/source/examples/css/syntax/simple_colorize.md @@ -1,93 +1,199 @@ -# CSS Syntax Parsing Example +# CSS Lexer with Colorized Output: Example -This article provides an explanation of a code example from the source file -[lexbor/css/syntax/simple_colorize.c](https://github.com/lexbor/lexbor/blob/master/examples/lexbor/css/syntax/simple_colorize.c). -The code implements a simple CSS parser that reads a CSS file, parses its -content, and provides color-coded output for each type of CSS rule and -declaration using ANSI escape codes. +This example demonstrates how to use the `lexbor` library to parse a CSS file and provide colorized output based on the different types of CSS tokens encountered. The code is found in the `lexbor/css/syntax/simple_colorize.c` file. The primary objective of this example is to showcase how to set up a CSS parser, process different CSS rules, and colorize the output dynamically to reflect the structure of CSS syntax. -## Structure of the Program +## Key Code Sections -The main function serves as the entry point of the program, where the user is -expected to provide a CSS file as an argument. The program then reads this file -into memory, initializes a CSS parser, and calls a function to parse the CSS -content. 
+### Parsing Initialization -### Key Components +First, let's look at the initialization process for the CSS parser, file reading, and the initial call to the parsing function: -1. **Initialization and File Handling**: - - The program checks for the correct number of command-line arguments. - - It leverages the `lexbor_fs_file_easy_read` function to read the CSS - content from the specified file into a buffer. +```c +if (argc != 2) { + fprintf(stderr, "Usage:\n"); + fprintf(stderr, "\tcolorize <file>\n"); + FAILED("Invalid number of arguments"); +} -2. **CSS Parser Setup**: - - It creates an instance of a CSS parser using `lxb_css_parser_create`. - - The parser is then initialized with `lxb_css_parser_init`. +fl = (const lxb_char_t *) argv[1]; -3. **CSS Parsing Function**: - - The function `css_parse` is called, which sets up the parsing context and - starts the rule parsing process. +css = lexbor_fs_file_easy_read(fl, &css_len); +if (css == NULL) { + FAILED("Failed to read CSS file"); +} -4. **Token Handling**: - - Several callback functions are defined to handle the various types of CSS - syntax tokens, including qualified rules, at-rules, and declaration blocks. +parser = lxb_css_parser_create(); +status = lxb_css_parser_init(parser, NULL); +if (status != LXB_STATUS_OK) { + FAILED("Failed to create CSS Parser"); +} -## Detailed Code Explanation +status = css_parse(parser, css, css_len); +``` -### CSS Parsing Function (`css_parse`) +This part checks for a single command-line argument, reads the CSS file, initializes the `lexbor` CSS parser, and starts the parsing process using `css_parse`. -The `css_parse` function initializes a context structure `css_ctx_t`, which -tracks the current offset within the CSS data while parsing. It sets the parsing -buffer using `lxb_css_parser_buffer_set` and begins the rule parsing using -`lxb_css_syntax_parser_list_rules_push`. 
+### `css_parse` Function -The call to `lxb_css_syntax_parser_run` runs the parser, which processes the CSS -tokens based on the rules specified. This function returns a status that -indicates whether the parsing succeeded or failed. +The main parsing logic is within the `css_parse` function: -### Token Callbacks +```c +static lxb_status_t +css_parse(lxb_css_parser_t *parser, const lxb_char_t *data, size_t length) +{ + css_ctx_t ctx; + lxb_css_syntax_rule_t *stack; -The program defines various inline functions and callbacks to handle the output -of tokens during parsing: + ctx.data = data; + ctx.offset = 0; -- **`css_print_token`** and **`css_print_token_offset`**: These functions print - a CSS token along with proper formatting. They utilize ANSI escape codes to - change text color in the console output for better visualization. + lxb_css_parser_buffer_set(parser, data, length); -### Rule Handling + stack = lxb_css_syntax_parser_list_rules_push(parser, NULL, NULL, + &css_list_rules, + &ctx, true, + LXB_CSS_SYNTAX_TOKEN_UNDEF); + if (stack == NULL) { + return LXB_STATUS_ERROR; + } -The parser is equipped with callbacks for handling different CSS rules: + printf("\n"); -- **`css_list_rules_state`**: This function handles the state of list rules and - is responsible for printing the state with a specific color. - -- **`css_at_rule_state`** and **`css_at_rule_block`**: These handle at-rules and - their blocks, printing the corresponding tokens and managing the nested - structure of CSS. - -- **`css_qualified_rule_state`** and **`css_qualified_rule_block`**: Manage the - parsing of qualified rules and their associated declaration blocks, printing - relevant information while maintaining contextual awareness of the current - location within the CSS input. + return lxb_css_syntax_parser_run(parser); +} +``` -### Declarations Handling +This function sets up the parser buffer and pushes the initial parsing rules onto the stack using `lxb_css_syntax_parser_list_rules_push`. 
It then runs the parser by calling `lxb_css_syntax_parser_run`. -The parsing of declarations involves several parts: +### Callback Structures -- **`css_declarations_name`** and **`css_declarations_value`**: Handle the CSS - property names and values, respectively, printing them in different colors to - distinguish visually between different parts of declarations. +The following structures define callbacks for handling different CSS syntactic elements: -### Memory Management +```c +static const lxb_css_syntax_cb_at_rule_t css_at_rule = { + .state = css_at_rule_state, + .block = css_at_rule_block, + .failed = lxb_css_state_failed, + .end = css_at_rule_end +}; -The code ensures to clean up the allocated memory for the CSS data buffer and -parser instance by calling `lexbor_free` and `lxb_css_parser_destroy`, which -prevents memory leaks. +static const lxb_css_syntax_cb_qualified_rule_t css_qualified_rule = { + .state = css_qualified_rule_state, + .block = css_qualified_rule_block, + .failed = lxb_css_state_failed, + .end = css_qualified_rule_end +}; -## Conclusion +static const lxb_css_syntax_cb_list_rules_t css_list_rules = { + .cb.state = css_list_rules_state, + .cb.failed = lxb_css_state_failed, + .cb.end = css_list_rules_end, + .next = css_list_rules_next, + .at_rule = &css_at_rule, + .qualified_rule = &css_qualified_rule +}; +``` -This example illustrates how to implement a simple CSS parser that reads a file, -processes its content into structured tokens, and outputs the result with visual -cues. The use of callback functions and context structures allows for flexible -and extendable parsing logic, suitable for more complex scenarios in CSS syntax -processing. \ No newline at end of file +These structures define callbacks for handling different rules such as at-rules, qualified rules, and lists of rules. They point to specific functions that handle each part of the CSS token processing. 
+ +### Handling Rules with Color + +Let's examine how specific rules are handled and colorized, starting with the at-rule state: + +```c +static bool +css_at_rule_state(lxb_css_parser_t *parser, + const lxb_css_syntax_token_t *token, void *ctx) +{ + css_print_token_offset(token, ctx); + + printf("\033[35m"); + css_print_token(token, ctx); + + lxb_css_syntax_parser_consume(parser); + token = lxb_css_syntax_parser_token(parser); + + printf("\033[33m"); + + css_consule_tokens(parser, token, ctx); + + printf("\033[39m"); + + return lxb_css_parser_success(parser); +} +``` + +Here, the at-rule state function prints the first token in one color (`\033[35m`), consumes it, prints the remaining tokens of this state in another color (`\033[33m`), and finally resets the foreground color (`\033[39m`), all using ANSI escape codes. + +### Token Serialization + +The function `css_consule_tokens` is used to serialize and print tokens: + +```c +lxb_inline void +css_consule_tokens(lxb_css_parser_t *parser, + const lxb_css_syntax_token_t *token, void *ctx) +{ + while (token != NULL && token->type != LXB_CSS_SYNTAX_TOKEN__END) { + (void) lxb_css_syntax_token_serialize(token, token_cb_f, ctx); + + lxb_css_syntax_parser_consume(parser); + token = lxb_css_syntax_parser_token(parser); + } +} +``` + +It serializes and consumes one token at a time until the parser returns `NULL` or a token of type `LXB_CSS_SYNTAX_TOKEN__END`. 
+ +### Coloring Declaration Names and Values + +The following functions handle the coloring of CSS property names and values: + +```c +static bool +css_declarations_name(lxb_css_parser_t *parser, + const lxb_css_syntax_token_t *token, void *ctx) +{ + css_print_token_offset(token, ctx); + + printf("\033[31m"); + + css_consule_tokens(parser, token, ctx); + + printf("\033[39m"); + + return lxb_css_parser_success(parser); +} + +static bool +css_declarations_value(lxb_css_parser_t *parser, + const lxb_css_syntax_token_t *token, void *ctx) +{ + css_print_token_offset(token, ctx); + + printf("\033[36m"); + + while (token != NULL && token->type != LXB_CSS_SYNTAX_TOKEN__END) { + (void) lxb_css_syntax_token_serialize(token, token_cb_f, ctx); + + lxb_css_syntax_parser_consume(parser); + token = lxb_css_syntax_parser_token(parser); + } + + printf("\033[39m"); + + return lxb_css_parser_success(parser); +} +``` + +These functions respectively color CSS property names in red and their values in cyan. + +## Notes + +- **Color Codes**: The example uses ANSI escape codes (e.g., `\033[31m`) to color the output, which may not be supported on all terminals. +- **Memory Management**: It is critical to properly destroy and free the parser and allocated memory to prevent leaks. +- **Error Handling**: The example includes fundamental error handling mechanisms but may require enhancements for robustness in production systems. + +## Summary + +This example illustrates how to use the `lexbor` library effectively for parsing and colorizing CSS. The key takeaways include setting up the parser, defining callback structures to handle different CSS rules, and utilizing token serialization and ANSI escape codes for colored output. Understanding these principles helps leverage the `lexbor` library for more complex CSS parsing and processing tasks. 
\ No newline at end of file diff --git a/source/examples/css/syntax/structure_parse_file.md b/source/examples/css/syntax/structure_parse_file.md index f564401..58a65b6 100644 --- a/source/examples/css/syntax/structure_parse_file.md +++ b/source/examples/css/syntax/structure_parse_file.md @@ -1,89 +1,169 @@ -# CSS Syntax Parser Example +# Parsing CSS Syntax from File: Example -This article provides an overview of the code located in -[lexbor/css/syntax/structure_parse_file.c](https://github.com/lexbor/lexbor/blob/master/examples/lexbor/css/syntax/structure_parse_file.c), -which implements a CSS syntax parser using the `lexbor` library. The primary goal -of this code is to parse CSS syntax rules and declarations, handling various -states and transitions within the parsing process. +This example demonstrates how to parse a CSS file and interpret its syntax using the `lexbor` library. The provided C code, located in `lexbor/css/syntax/structure_parse_file.c`, reads a CSS file, parses its content, and handles different CSS rules and declarations. The primary aim of this example is to show the steps involved in setting up a `lexbor` CSS parser, defining necessary callbacks, and executing the parsing process. This detailed explanation walks through the key functionality and sophisticated use of the `lexbor` library functions and data types. -## Code Overview +## Key Code Sections -The code starts with the inclusion of headers that bring in necessary -definitions and functions from the `lexbor` library. It defines multiple functions -and callback structures that manage the parsing of different CSS constructs. -Central to the code is the `main` function, which serves as the entry point of -the application. +### Initialization and Main Function -### Main Function +At the heart of the program is the `main()` function, which initializes the CSS parser and reads the CSS input file. 
-The `main` function performs several key operations: +```c +int +main(int argc, const char *argv[]) +{ + size_t css_len; + lxb_char_t *css; + lxb_status_t status; + lxb_css_parser_t *parser; + const lxb_char_t *fl; -1. **Argument Validation**: It checks if the number of command-line arguments is - correct. If not, it prints usage instructions and exits the program. - -2. **File Reading**: It reads a CSS file specified by the user and stores its - contents into a variable `css`. If this reading fails, the program exits with - an error message. + if (argc != 2) { + fprintf(stderr, "Usage:\n"); + fprintf(stderr, "\tstructure_parse_file \n"); + FAILED("Invalid number of arguments"); + } -3. **Parser Initialization**: It creates and initializes a CSS parser instance. - If the initialization fails, the program reports an error and exits. + fl = (const lxb_char_t *) argv[1]; -4. **Parsing Execution**: The `css_parse` function is called with the parser and - the CSS data to carry out the parsing process. + css = lexbor_fs_file_easy_read(fl, &css_len); + if (css == NULL) { + FAILED("Failed to read CSS file"); + } -5. **Cleanup**: After the parsing is done, it releases allocated resources and - exits with success or failure status based on the parsing outcome. + parser = lxb_css_parser_create(); + status = lxb_css_parser_init(parser, NULL); + if (status != LXB_STATUS_OK) { + FAILED("Failed to create CSS Parser"); + } -### CSS Parsing Implementation + status = css_parse(parser, css, css_len); -The `css_parse` function is crucial as it sets up the parsing buffer and pushes -the initial parsing rules onto a stack. Here's a breakdown of its functionality: + (void) lexbor_free(css); + (void) lxb_css_parser_destroy(parser, true); -- **Set Buffer**: The parsing buffer of the parser is set with the provided CSS - data and its length. 
- -- **Push Rules**: The function uses the `lxb_css_syntax_parser_list_rules_push` - to initiate the parsing of list rules, which is a fundamental construct in - CSS. It expects a pointer to a set of callback functions that manage how the - list of rules is processed. + if (status != LXB_STATUS_OK) { + FAILED("Failed to parse CSS"); + } -- **Run Parser**: Finally, it triggers the parsing process with - `lxb_css_syntax_parser_run`, which advances through the tokens available in - the CSS data. + return EXIT_SUCCESS; +} +``` -### Callback Functions +In this segment, the program reads a CSS file, initializes the CSS parser, and invokes the `css_parse` function to start parsing. -The code defines a series of callback functions that manage specific CSS rules, -states, and declarations: +### Parsing Function -- **State Management**: Functions like `css_list_rules_state`, - `css_at_rule_state`, and `css_declarations_name` handle specific parser - states. Each of these functions typically logs the current processing step and - processes tokens of interest. They return a success status after handling the - tokens. +The `css_parse` function sets the buffer and pushes the initial rule stack to begin parsing. -- **Handling Blocks**: Functions such as `css_at_rule_block` and - `css_qualified_rule_block` manage blocks of CSS rules, utilizing the - `css_consule_tokens` function to process tokens within those blocks. These - functions also handle stack manipulations depending on the rule context, such - as pushing or popping a stack. +```c +static lxb_status_t +css_parse(lxb_css_parser_t *parser, const lxb_char_t *data, size_t length) +{ + lxb_css_syntax_rule_t *stack; -- **End States**: Functions like `css_list_rules_end` and `css_declarations_end` - signal the completion of various sections. These may log end messages or - perform any necessary cleanup. 
+ lxb_css_parser_buffer_set(parser, data, length); -### Additional Utility Functions + stack = lxb_css_syntax_parser_list_rules_push(parser, NULL, NULL, + &css_list_rules, + NULL, true, + LXB_CSS_SYNTAX_TOKEN_UNDEF); + if (stack == NULL) { + return LXB_STATUS_ERROR; + } -The utility function `css_consule_tokens` is noteworthy. It iterates through -tokens and processes each one sequentially, calling -`lxb_css_syntax_token_serialize`, which presumably serializes or logs the token -data. This function also handles token consumption, facilitating smooth progress -through the parsing state. + return lxb_css_syntax_parser_run(parser); +} +``` -### Conclusion +Here, `lxb_css_parser_buffer_set` assigns the data to the parser, and `lxb_css_syntax_parser_list_rules_push` initializes the entry point for parsing, specifying callbacks for handling list rules. -The code contained in `structure_parse_file.c` offers a comprehensive -implementation of a CSS syntax parser with well-defined states and callbacks. -The use of systematic error handling and resource management provides stability -to the parsing process. By integrating these components, the `lexbor` library -enhances its ability to interpret and manipulate CSS effectively. \ No newline at end of file +### Callback: Handling List Rules + +Callbacks manage the state transitions and actions for different parts of the CSS syntax. For example, the `css_list_rules_state` is invoked when starting to process a list of rules. 
+ +```c +static bool +css_list_rules_state(lxb_css_parser_t *parser, + const lxb_css_syntax_token_t *token, void *ctx) +{ + PRINT("Begin List Of Rules"); + + return lxb_css_parser_success(parser); +} + +static bool +css_list_rules_next(lxb_css_parser_t *parser, + const lxb_css_syntax_token_t *token, void *ctx) +{ + PRINT("Next List Of Rules"); + + return lxb_css_parser_success(parser); +} +``` + +These callbacks print messages indicating the start and continuation of rule listings in the CSS file and signify successful parsing. + +### Callback: Handling At-Rules + +At-rules (`@` rules) such as `@media` or `@keyframes` have dedicated callbacks. + +```c +static bool +css_at_rule_state(lxb_css_parser_t *parser, + const lxb_css_syntax_token_t *token, void *ctx) +{ + PRINT("Begin At-Rule Prelude"); + + css_consule_tokens(parser, token, ctx); + + printf("\n\n"); + + return lxb_css_parser_success(parser); +} + +static bool +css_at_rule_block(lxb_css_parser_t *parser, + const lxb_css_syntax_token_t *token, void *ctx) +{ + PRINT("Begin At-Rule Block"); + + css_consule_tokens(parser, token, ctx); + + printf("\n\n"); + + return lxb_css_parser_success(parser); +} +``` + +These functions print messages and consume tokens associated with at-rule prelude and block contexts. + +### Consuming Tokens + +The `css_consule_tokens` helper is shared by callbacks such as `css_at_rule_state` and `css_at_rule_block`: it serializes each token and then consumes it. + +```c +lxb_inline void +css_consule_tokens(lxb_css_parser_t *parser, + const lxb_css_syntax_token_t *token, void *ctx) +{ + while (token != NULL && token->type != LXB_CSS_SYNTAX_TOKEN__END) { + (void) lxb_css_syntax_token_serialize(token, token_cb_f, ctx); + + lxb_css_syntax_parser_consume(parser); + token = lxb_css_syntax_parser_token(parser); + } +} +``` + +The loop serializes and consumes tokens until the parser returns `NULL` or a token of type `LXB_CSS_SYNTAX_TOKEN__END`. 
+ +## Notes + +- **Initialization**: Correct initialization and cleanup of the parser are essential for avoiding memory leaks. +- **Callback Mechanism**: The versatile use of callbacks for various states (e.g., at-rules, declarations) makes it easy to extend the parser functionality. +- **Token Handling**: Efficient handling and processing of tokens ensure correct CSS parsing and interpretation. + +## Summary + +The example code in `lexbor/css/syntax/structure_parse_file.c` serves as an excellent illustration of parsing CSS files using the `lexbor` library. By walking through the setup, parsing mechanics, and token handling, one can gain a solid understanding of how to leverage `lexbor` for CSS parsing tasks. This example lays the foundation for more advanced CSS manipulation and analysis using `lexbor`. \ No newline at end of file diff --git a/source/examples/css/syntax/tokenizer/chunks_stdin.md b/source/examples/css/syntax/tokenizer/chunks_stdin.md index 91d140c..f19da4c 100644 --- a/source/examples/css/syntax/tokenizer/chunks_stdin.md +++ b/source/examples/css/syntax/tokenizer/chunks_stdin.md @@ -1,51 +1,34 @@ -# CSS Syntax Tokenizer Example +# Tokenizing CSS from Standard Input: Example -This article explains the implementation of a CSS syntax tokenizer in the file -[lexbor/css/syntax/tokenizer/chunks_stdin.c](https://github.com/lexbor/lexbor/blob/master/examples/lexbor/css/syntax/tokenizer/chunks_stdin.c). -The code demonstrates how to read CSS data from standard input, tokenize it, and -output the identified token types along with their serialized representations. +The file `lexbor/css/syntax/tokenizer/chunks_stdin.c` demonstrates how to tokenize CSS data read from standard input using the `lexbor` library. This article will delve into the key parts of this example, explaining the purpose and workings of each section. 
-## Overview +## Key Code Sections -The main purpose of this example is to showcase the mechanics of the -`lxb_css_syntax_tokenizer`, a component provided by the `lexbor` library for -parsing CSS syntax. The example leverages standard input (stdin) to read CSS -input, processes the tokens through the tokenizer, and outputs details about -each token to the console. +### Callback for Token Serialization -## Code Breakdown - -### Includes and Definitions - -At the beginning of the file, necessary headers are included, such as -`lexbor/css/css.h`, which contains the definitions and interfaces for the CSS -parser. A small buffer size of 32 bytes is defined with `#define BUFFER_SIZE -32`, which limits the amount of data read from stdin at one time, making it -suitable for demonstration purposes. - -### Callback Function - -The `callback` function is defined to handle the serialized output of the -tokens: +The function `callback` is used to handle the serialized tokens. It simply prints the token data to the standard output. ```c -lxb_status_t callback(const lxb_char_t *data, size_t len, void *ctx) { +lxb_status_t +callback(const lxb_char_t *data, size_t len, void *ctx) +{ printf("%s", (const char *) data); + return LXB_STATUS_OK; } ``` -This function prints the serialized token data to the console and returns a -status indicating success. It serves as a simple mechanism to display token -information during parsing. +This callback is later passed to `lxb_css_syntax_token_serialize` in `main`, so it determines how each serialized token is written out. -### Chunk Callback Function +### Handling Input in Chunks -The `chunk_cb` function reads chunks of CSS data into a buffer and sets up the -tokenizer to consume these chunks: +The function `chunk_cb` reads data from standard input into a buffer, allowing the tokenizer to process it in chunks. This is particularly useful for handling large inputs gracefully. 
```c -lxb_status_t chunk_cb(lxb_css_syntax_tokenizer_t *tkz, const lxb_char_t **data, const lxb_char_t **end, void *ctx) { +lxb_status_t +chunk_cb(lxb_css_syntax_tokenizer_t *tkz, const lxb_char_t **data, + const lxb_char_t **end, void *ctx) +{ size_t size; lxb_char_t *buff = ctx; @@ -53,7 +36,8 @@ lxb_status_t chunk_cb(lxb_css_syntax_tokenizer_t *tkz, const lxb_char_t **data, if (size != BUFFER_SIZE) { if (feof(stdin)) { tkz->eof = true; - } else { + } + else { return EXIT_FAILURE; } } @@ -65,19 +49,16 @@ lxb_status_t chunk_cb(lxb_css_syntax_tokenizer_t *tkz, const lxb_char_t **data, } ``` -The function first attempts to read a buffer full of CSS data from stdin. If the -end of input is reached, it marks the tokenizer's end-of-file (EOF) state. If an -error occurs during reading, it returns a failure status. The function -effectively prepares the data for the tokenizer by updating the pointed `data` -and `end` pointers. +This function fills a buffer with a fixed size (`BUFFER_SIZE`) from `stdin`, managing the end-of-file condition by setting `tkz->eof` when necessary. This function returns `LXB_STATUS_OK` if reading proceeds without errors. -### Main Function +### Tokenizing the Input -The `main` function orchestrates the initialization and the execution of the CSS -syntax tokenizer: +The `main` function initializes the CSS tokenizer, sets the chunk callback, and processes tokens in a loop until the end-of-file token is encountered. ```c -int main(int argc, const char *argv[]) { +int +main(int argc, const char *argv[]) +{ lxb_status_t status; lxb_css_syntax_token_t *token; lxb_css_syntax_tokenizer_t *tkz; @@ -93,57 +74,46 @@ int main(int argc, const char *argv[]) { } lxb_css_syntax_tokenizer_chunk_cb_set(tkz, chunk_cb, inbuf); -``` -This section starts by creating and initializing a tokenizer instance. If -initialization fails, it gracefully exits the process. 
Notably, it sets the -chunk callback function, associating it with the previously defined `chunk_cb` -and the input buffer `inbuf`. + do { + token = lxb_css_syntax_token(tkz); + if (token == NULL) { + PRINT("Failed to parse CSS"); + goto failed; + } -#### Token Processing Loop + name = lxb_css_syntax_token_type_name_by_id(token->type); + printf("%s: ", (const char *) name); -The main loop processes tokens until the EOF is reached: + lxb_css_syntax_token_serialize(token, callback, NULL); + printf("\n"); -```c -do { - token = lxb_css_syntax_token(tkz); - if (token == NULL) { - PRINT("Failed to parse CSS"); - goto failed; - } + type = lxb_css_syntax_token_type(token); - name = lxb_css_syntax_token_type_name_by_id(token->type); - printf("%s: ", (const char *) name); + lxb_css_syntax_token_consume(tkz); + } + while (type != LXB_CSS_SYNTAX_TOKEN__EOF); - lxb_css_syntax_token_serialize(token, callback, NULL); - printf("\n"); - - type = lxb_css_syntax_token_type(token); - lxb_css_syntax_token_consume(tkz); -} while (type != LXB_CSS_SYNTAX_TOKEN__EOF); -``` + lxb_css_syntax_tokenizer_destroy(tkz); -In this loop, it retrieves the next token from the tokenizer and checks for -parsing errors. If a token is successfully obtained, it retrieves and prints the -token's type name, serializes the token using the earlier defined `callback`, -and then consumes the token to prepare for the next cycle. This loop continues -until an EOF token is encountered. + return EXIT_SUCCESS; -### Cleanup +failed: -At the end of the function, the tokenizer is destroyed to free up allocated -resources: + lxb_css_syntax_tokenizer_destroy(tkz); -```c -lxb_css_syntax_tokenizer_destroy(tkz); + return EXIT_FAILURE; +} ``` -If any failures occur at various stages, the code ensures proper cleanup to -avoid memory leaks. +Here, the tokenizer is created and initialized, and the chunk callback is set with a buffer for data. 
The loop continues to fetch tokens, prints their names and serialized content, and consumes each token until the end-of-file token is reached. + +## Notes + +- **Buffer Size**: The buffer size (`BUFFER_SIZE`) is set to 32 to demonstrate handling small chunks of data. This size can be adjusted based on specific needs. +- **Error Handling**: The example includes basic error handling, with appropriate messages and clean-up. +- **EOF Management**: The end-of-file is managed using `tkz->eof`, ensuring the tokenizer knows when no more data is available. -## Conclusion +## Summary -This example illustrates how to implement a simple CSS syntax tokenizer using -the `lexbor` library, allowing for parsing CSS input from stdin and outputting -token information. Anyone looking to understand or extend CSS parsing -functionality can use this code as a foundation for further development. \ No newline at end of file +This example illustrates how to tokenize CSS data read from standard input, demonstrating key aspects of using the `lexbor` library. It covers initialization, setting up a chunk callback function, handling tokens, and managing end-of-file conditions. Understanding these steps is crucial for effectively working with `lexbor` to tokenize CSS or other similar structured data inputs. \ No newline at end of file diff --git a/source/examples/css/syntax/tokenizer/from_file.md b/source/examples/css/syntax/tokenizer/from_file.md index 74492cc..7eda93e 100644 --- a/source/examples/css/syntax/tokenizer/from_file.md +++ b/source/examples/css/syntax/tokenizer/from_file.md @@ -1,71 +1,14 @@ -# CSS Syntax Tokenizer Example +# Parsing a CSS File with `lexbor`: Example -This article provides a detailed explanation of a CSS syntax tokenizer -implemented in the file -[lexbor/css/syntax/tokenizer/from_file.c](https://github.com/lexbor/lexbor/blob/master/examples/lexbor/css/syntax/tokenizer/from_file.c). 
-The code serves the purpose of reading a CSS file, processing its contents to -extract tokens, and producing output that describes each token. +This article focuses on the source file `lexbor/css/syntax/tokenizer/from_file.c` and explains how to parse a CSS file using the `lexbor` library. This explanation delves into the specific functions and methods employed to tokenize and handle CSS content, illustrating a practical approach to CSS parsing with `lexbor`. -## Overview +The example code demonstrates how to read a CSS file, tokenize its content using `lexbor`'s CSS syntax tokenizer, and print each recognized token. Understanding this example provides insight into the fundamental use of `lexbor` for processing CSS files, which is critical for many web development and parsing applications that require robust CSS manipulation. -The main function of the tokenizer is to parse CSS code from a file, generate -tokens for syntactic analysis, and then invoke a callback function to handle the -output of each token. The program efficiently handles input and organizes the -parsing process with the help of the `lexbor` library. +## Key Code Sections -## Code Breakdown +### Reading the CSS File -### Includes and Utility Functions - -At the beginning of the file, necessary libraries are included: - -```c -#include -#include -``` - -The first include provides access to CSS-related functionality within the lexbor -library, whereas the second includes core file system operations needed to read -the CSS file. - -A utility function `usage` is defined to provide a simple usage instruction: - -```c -static void usage(void) -{ - fprintf(stderr, "parse_file \n"); -} -``` - -This function prints an error message when the user does not provide the correct -number of arguments. 
- -### Main Function Logic - -The entry point of the program is the `main` function, which processes -command-line arguments and orchestrates the tokenization process: - -```c -int main(int argc, const char *argv[]) -``` - -#### Argument Validation - -At the start of the main function, the program checks whether exactly one -command-line argument (the CSS file name) has been provided: - -```c -if (argc != 2) { - usage(); - FAILED("Invalid number of arguments"); -} -``` - -If not, it calls the `usage` function and exits with an error. - -#### File Reading - -Next, the code attempts to read the specified CSS file: +The initial step in the code involves reading a CSS file. This is accomplished using `lexbor`'s file reading utility: ```c css = lexbor_fs_file_easy_read((const lxb_char_t *) argv[1], &css_len); @@ -74,36 +17,36 @@ if (css == NULL) { } ``` -The `lexbor_fs_file_easy_read` function reads the entire file into memory, and -if it fails, the program reports the error and exits. +Here, `lexbor_fs_file_easy_read` takes the file path provided via command line arguments and reads its content into a dynamically allocated buffer. The length of the CSS content is stored in `css_len`. If the file reading fails, the program exits with an error. -#### Tokenizer Initialization +### Initializing the Tokenizer -The tokenizer is created and initialized: +Once the CSS content is loaded, the next step is to initialize the tokenizer: ```c tkz = lxb_css_syntax_tokenizer_create(); status = lxb_css_syntax_tokenizer_init(tkz); +if (status != LXB_STATUS_OK) { + PRINT("Failed to create CSS:Syntax parser"); + goto failed; +} ``` -These lines allocate memory for the tokenizer and perform any necessary setup. -If initialization fails, an error message is printed, and the program proceeds -to cleanup. +The tokenizer is created with `lxb_css_syntax_tokenizer_create` and initialized with `lxb_css_syntax_tokenizer_init`. 
If the initialization fails, the code jumps to the `failed` label to clean up resources and exit. -#### Setting Input Buffer +### Setting the Buffer and Tokenizing -Next, the contents of the CSS file are set as the input buffer for the -tokenizer: +After initializing the tokenizer, the CSS content is set as the buffer for the tokenizer: ```c lxb_css_syntax_tokenizer_buffer_set(tkz, css, css_len); ``` -This prepares the tokenizer to begin processing the CSS data. +This function sets the internal buffer of the tokenizer to the CSS data, preparing it for tokenization. -### Tokenization Loop +### Processing Tokens -The program enters a loop to process the tokens extracted from the input buffer: +The core of the tokenization process involves a loop that retrieves and processes each token: ```c do { @@ -115,45 +58,42 @@ do { name = lxb_css_syntax_token_type_name_by_id(token->type); printf("%s: ", (const char *) name); - + lxb_css_syntax_token_serialize(token, callback, NULL); printf("\n"); type = lxb_css_syntax_token_type(token); - + lxb_css_syntax_token_consume(tkz); } while (type != LXB_CSS_SYNTAX_TOKEN__EOF); ``` -#### Token Extraction +In this loop: +- `lxb_css_syntax_token` retrieves the next token from the tokenizer. +- `lxb_css_syntax_token_type_name_by_id` gets the token's type name. +- `lxb_css_syntax_token_serialize` outputs the token's content using a callback function. +- `lxb_css_syntax_token_consume` advances the tokenizer to the next token. -Within the loop, the function `lxb_css_syntax_token` retrieves a token. If no -token is available, it reports a parsing failure. Upon successful token -retrieval, it prints the type name of the token followed by calling -`lxb_css_syntax_token_serialize`, which uses the provided `callback` function to -output the token data. +The loop continues until the end-of-file (EOF) token is encountered. -The type of the current token is acquired to determine if the end of the file -(EOF) has been reached. 
If the EOF is not reached, the loop continues to consume -tokens. +### Cleaning Up -### Cleanup and Exit - -When the entire CSS file has been processed, resources are cleaned up: +Finally, once all tokens are processed, resources are cleaned up: ```c lxb_css_syntax_tokenizer_destroy(tkz); lexbor_free(css); ``` -Finally, the program returns `EXIT_SUCCESS` if the execution was successful, or -`EXIT_FAILURE` in case of any errors during the process. +This ensures that allocated memory is properly freed. + +## Notes + +- `lexbor_fs_file_easy_read` simplifies file reading but requires proper error handling. +- Proper initialization and cleanup of the tokenizer are crucial to avoid memory leaks. +- The tokenization loop processes each token and prints its type and content. -## Conclusion +## Summary -The CSS syntax tokenizer effectively reads and parses a CSS file, extracting and -displaying token details by utilizing the `lexbor` library's API for CSS -processing. This example demonstrates not only the functionality of lexer-based -parsing but also highlights memory management and error handling within a -complex system. \ No newline at end of file +This example illustrates how to use `lexbor` to read and tokenize CSS files. It covers essential functions for file reading, tokenizer initialization, and token processing. Understanding these steps is fundamental for developers looking to integrate CSS parsing capabilities into their applications using `lexbor`. 
\ No newline at end of file diff --git a/source/examples/css/syntax/tokenizer/print_raw.md b/source/examples/css/syntax/tokenizer/print_raw.md index ebcc272..72660bf 100644 --- a/source/examples/css/syntax/tokenizer/print_raw.md +++ b/source/examples/css/syntax/tokenizer/print_raw.md @@ -1,17 +1,12 @@ -# CSS Syntax Tokenizer Example +# CSS Tokenizer Printing: Example -This article provides an overview of the `print_raw.c` source file, which -implements a simple command-line tool for tokenizing CSS syntax using the `lexbor` -library. The primary purpose of this code is to read a CSS file, tokenize its -contents, and print the tokens to the standard output. +This article explains the source code example found in `lexbor/css/syntax/tokenizer/print_raw.c`. This example demonstrates how to utilize the `lexbor` library to parse a CSS file and print the raw tokens produced by the tokenizer. We'll delve into the key code sections to better understand the parsing process and token management with `lexbor`. -## Breakdown of Major Code Sections +## Key Code Sections ### Usage Function -The `usage` function is defined to inform users about how to execute the program -properly. It outputs a simple message stating that the tool requires one -argument, which is the name of the file to process: +The `usage` function provides a simple command-line usage description. It's designed to inform the user about the proper way to run the program. ```c static void @@ -21,71 +16,77 @@ usage(void) } ``` -This function is called when the number of command line arguments (`argv`) -provided is incorrect. It helps to guide users in using the tool correctly. +This function prints the correct command-line format to `stderr`. It's invoked when the user provides incorrect arguments. -### Main Function Logic +### Colorize Callback -The `main` function serves as the entry point of the program. 
It starts by -checking if the user has provided exactly one argument: +The `colorize_cb` function prints tokens to the standard output. It differentiates special cases, such as dimension tokens, and handles them appropriately. ```c -if (argc != 2) { - usage(); - FAILED("Invalid number of arguments"); -} -``` +void +colorize_cb(lxb_css_syntax_token_t *token) +{ + int length; + lxb_css_syntax_token_base_t *base; + lxb_css_syntax_token_string_t *str; -If this condition is not met, the `usage` function is invoked to display the -correct usage. The `FAILED` macro indicates an error state, although its -definition is not shown in this excerpt. + base = lxb_css_syntax_token_base(token); + length = (int) base->length; -### Reading the CSS File + printf("%.*s", length, base->begin); -The next step involves reading the CSS file specified by the user. The function -`lexbor_fs_file_easy_read` attempts to read the file into memory: + if (token->type == LXB_CSS_SYNTAX_TOKEN_DIMENSION) { + str = lxb_css_syntax_token_dimension_string(token); -```c -css = lexbor_fs_file_easy_read((const lxb_char_t *) argv[1], &css_len); -if (css == NULL) { - FAILED("Failed to read CSS file"); + /* Ident */ + length = (int) str->base.length; + + printf("%.*s", length, str->base.begin); + } } ``` -If the reading process fails, the program terminates by invoking the `FAILED` -macro once again to report the issue. +This function extracts the base token details and prints them. If the token is of type `LXB_CSS_SYNTAX_TOKEN_DIMENSION`, it also prints the dimension string. -### Tokenization Process +### Main Function -The tokenization process begins with the creation of a tokenizer instance: +The `main` function orchestrates the overall process of reading the file, initializing the tokenizer, and processing CSS tokens. 
```c -tkz = lxb_css_syntax_tokenizer_create(); -status = lxb_css_syntax_tokenizer_init(tkz); -``` +int +main(int argc, const char *argv[]) +{ + lxb_status_t status; + lxb_css_syntax_token_t *token; + lxb_css_syntax_tokenizer_t *tkz; + lxb_css_syntax_token_type_t type; + lxb_char_t *css; + size_t css_len; + + if (argc != 2) { + usage(); + FAILED("Invalid number of arguments"); + } -After creating the tokenizer, it is initialized with the -`lxb_css_syntax_tokenizer_init` function. If the initialization does not -succeed, an error message is printed, and the program enters the cleanup phase. + css = lexbor_fs_file_easy_read((const lxb_char_t *) argv[1], &css_len); + if (css == NULL) { + FAILED("Failed to read CSS file"); + } -The following block of code sets the tokenizer's buffer to contain the CSS -content read from the file: + tkz = lxb_css_syntax_tokenizer_create(); + status = lxb_css_syntax_tokenizer_init(tkz); + if (status != LXB_STATUS_OK) { + PRINT("Failed to create CSS:Syntax parser"); + goto failed; + } +``` + +This block checks the command-line arguments and reads the content of the specified CSS file. If successful, it creates and initializes the CSS tokenizer. ```c tkz->with_comment = true; - lxb_css_syntax_tokenizer_buffer_set(tkz, css, css_len); -``` - -The `with_comment` flag indicates whether comments should be included in the -tokenization process. - -### Processing Tokens - -The main loop of the `main` function processes the tokens generated by the -tokenizer: -```c do { token = lxb_css_syntax_token(tkz); if (token == NULL) { @@ -98,33 +99,39 @@ do { type = lxb_css_syntax_token_type(token); lxb_css_syntax_token_consume(tkz); -} -while (type != LXB_CSS_SYNTAX_TOKEN__EOF); +} while (type != LXB_CSS_SYNTAX_TOKEN__EOF); + +lxb_css_syntax_tokenizer_destroy(tkz); +lexbor_free(css); + +printf("\n"); + +return EXIT_SUCCESS; ``` -Within this loop, a token is fetched, and if it cannot be retrieved, an error -message is printed. 
The `colorize_cb` function is called to handle the output -for each token. After processing the token, its type is checked, and it is -consumed for the next iteration. +The tokenizing loop handles each token produced by the tokenizer. Each token is processed by the `colorize_cb` function and then consumed. The loop continues until an EOF token is encountered. -### Cleanup Phase +### Clean-Up and Error Handling -After all tokens have been processed, the program cleans up by destroying the -tokenizer instance and freeing any allocated memory: +If any step in the process fails, the resources are properly released, and an error code is returned. ```c +failed: + lxb_css_syntax_tokenizer_destroy(tkz); lexbor_free(css); + +return EXIT_FAILURE; ``` -Finally, if no errors occurred during processing, the program returns -`EXIT_SUCCESS`. In case of failure, it follows a similar cleanup procedure but -returns `EXIT_FAILURE`. +This block ensures that the tokenizer and memory allocated for the CSS content are freed even if an error occurs. + +## Notes + +1. **Token Consumption**: The `lxb_css_syntax_token_consume` function advances the tokenizer to the next token. +2. **Dimension Tokens**: The example specially handles `LXB_CSS_SYNTAX_TOKEN_DIMENSION`, indicating the handling of composite tokens. +3. **Error Handling**: Proper clean-up routines ensure that resources are freed in both success and failure cases. -## Conclusion +## Summary -The `print_raw.c` implementation demonstrates how to leverage the `lexbor` library -for CSS syntax tokenization. By following a structured approach, it effectively -reads CSS content, processes it into tokens, and provides robust error handling. -This example serves as a foundation for further exploration of CSS parsing and -analysis using `lexbor`. \ No newline at end of file +This example effectively demonstrates how to use `lexbor` to tokenize and print CSS tokens. 
It highlights crucial aspects such as correct tokenizer initialization, token handling, and the importance of resource management. Understanding this pattern is essential for developers dealing with CSS parsing or similar tasks using the `lexbor` library. \ No newline at end of file diff --git a/source/examples/encoding/buffer/decode/decode.md b/source/examples/encoding/buffer/decode/decode.md index d572ce6..9e8998a 100644 --- a/source/examples/encoding/buffer/decode/decode.md +++ b/source/examples/encoding/buffer/decode/decode.md @@ -1,76 +1,64 @@ -# UTF-8 Decoding Example +# Decoding UTF-8 Strings to Code Points with `lexbor`: Example +The code example in `lexbor/encoding/buffer/decode/decode.c` demonstrates how to decode a UTF-8 encoded string into individual Unicode code points using the `lexbor` library. The example illustrates initialization, decoding, and extracting code points using various `lexbor` functions and data types. -In this article, we will explore a code example from the file -[lexbor/encoding/buffer/decode/decode.c](https://github.com/lexbor/lexbor/blob/master/examples/lexbor/encoding/buffer/decode/decode.c) -that demonstrates how to decode a UTF-8 encoded string into code points using -the `lexbor` library. This example specifically highlights the usage of `lexbor`'s -encoding functionalities, providing insights into how to leverage these features -for character decoding in C. +## Key Code Sections -## Code Explanation +### Initialization and Buffer Preparation -The code begins by including the necessary header files. It specifically -includes `lexbor/encoding/encoding.h`, which contains the declarations needed -for encoding and decoding operations. The definition of the `FAILED` macro is -also provided, which facilitates error handling by printing an error message to -`stderr` and terminating the program if an error occurs. 
+```c +const lxb_char_t *data = (const lxb_char_t *) "Привет, мир!"; +const lxb_char_t *end = data + strlen((char *) data); +``` -### Main Function +Here, a UTF-8 encoded string `"Привет, мир!"` is defined and a pointer to its end is computed with `strlen`. These will be utilized later during the decoding process. -The `main` function serves as the entry point of our program, where we will set -up the decoding of a UTF-8 encoded string. +### Initializing the Decoder -#### Variable Declarations +```c +const lxb_encoding_data_t *encoding; +lxb_status_t status; +lxb_codepoint_t cp[32]; +lxb_encoding_decode_t decode; -Within the `main` function, several important variables are declared: +encoding = lxb_encoding_data(LXB_ENCODING_UTF_8); -- `buf_length`: To store the length of the decoded buffer. -- `status`: To hold the status of operations, indicated by the `lxb_status_t` - type. -- `cp`: An array of `lxb_codepoint_t` to hold the decoded code points. -- `decode`: An instance of `lxb_encoding_decode_t`, which manages the decoding - process. -- `encoding`: A pointer to the encoding data. +status = lxb_encoding_decode_init(&decode, encoding, cp, + sizeof(cp) / sizeof(lxb_codepoint_t)); +if (status != LXB_STATUS_OK) { + FAILED("Failed to initialization decoder"); +} +``` -Next, we prepare the buffer that contains the UTF-8 string "Привет, мир!" (which -translates to "Hello, World!"). The buffer is defined as `data`, and `end` is -set to point to the end of the string using `strlen`. +In this section, the UTF-8 encoding data structure is obtained with `lxb_encoding_data(LXB_ENCODING_UTF_8)`. The decoder is then initialized via `lxb_encoding_decode_init`, where `decode` is the decoder context, `encoding` provides encoding information, and `cp` is an array to store the decoded code points. The size of this array is specified in terms of the number of `lxb_codepoint_t` elements it can hold. 
-#### Initialization +### Performing the Decoding -The initialization process is crucial for setting up the decoder. We call -`lxb_encoding_data(LXB_ENCODING_UTF_8)` to get the encoding data for UTF-8. -Then, we initialize the decoder using `lxb_encoding_decode_init`, passing the -decoder instance, encoding, the code point array, and its capacity. +```c +status = encoding->decode(&decode, &data, end); +if (status != LXB_STATUS_OK) { + // In this example, this cannot happen. +} +``` -If this initialization fails, the `FAILED` macro is triggered, notifying us with -an error message and stopping the program. +The actual decoding process occurs with `encoding->decode(&decode, &data, end)`, taking the initialized decoder and the data buffer into account. The `data` pointer is updated during the procedure and moves towards `end`. It’s worth noting that usual error handling is omitted here, under the assumption that decoding will succeed. -#### Decoding Process +### Printing the Decoded Code Points -After successful initialization, we print the original UTF-8 string to the -console. The actual decoding is carried out by calling the `decode` function -through the `encoding` pointer. The function decodes the string pointed to by -`data` up to its `end`, storing the results in the `cp` array. +```c +size_t buf_length = lxb_encoding_decode_buf_used(&decode); -In this context, an error during decoding is not expected. Therefore, the code -contains a comment indicating that such a situation cannot occur in this -example, underlining the robustness of the decoding function for the given -input. +for (size_t i = 0; i < buf_length; i++) { + printf("0x%04X\n", cp[i]); +} +``` -#### Output and Conclusion +Finally, the number of used buffer entries (`buf_length`) is obtained using `lxb_encoding_decode_buf_used(&decode)`. 
A loop then iterates through the decoded code points within `cp[]`, printing each as a hexadecimal value (`0x%04X`); these values correspond to the Unicode code points of the original UTF-8 string. -Finally, we calculate the length of the buffer used in the decoding process with -`lxb_encoding_decode_buf_used(&decode)` and print each decoded code point in -hexadecimal format. +## Notes -The program concludes with a return statement indicating successful execution. +- **Error Handling**: The macro `FAILED(...)` is used for error handling, terminating the program with a corresponding message and `EXIT_FAILURE`. This ensures immediate notification of initialization failures. +- **Buffer Management**: The `cp[]` array size is set to 32, meant for handling individual code points and providing enough space for decoding without buffer overflow. +- **Assumptions**: The example assumes a successful decoding process, omitting error handling for the decoding step itself. ## Summary - -This example effectively illustrates how to decode a UTF-8 string into -individual code points using the `lexbor` library. It emphasizes the -initialization of the decoding context, error handling strategies, and the -process of translating encoded UTF-8 data into usable character representations. -Through careful management of buffers and decoding functions, developers can -build robust applications that accurately handle multi-byte character sets. \ No newline at end of file +This example illustrates a foundational aspect of working with `lexbor`: converting a UTF-8 encoded string to Unicode code points. By understanding how to initialize the decoder, handle buffer management, and perform the decoding process, developers can leverage `lexbor` for advanced text processing tasks. This underscores `lexbor`’s utility in dealing with various encodings efficiently and robustly. 
\ No newline at end of file diff --git a/source/examples/encoding/buffer/decode/decoder.md b/source/examples/encoding/buffer/decode/decoder.md index bfbf69b..e70fff2 100644 --- a/source/examples/encoding/buffer/decode/decoder.md +++ b/source/examples/encoding/buffer/decode/decoder.md @@ -1,84 +1,121 @@ -# Unicode Decoder Example +# Lexbor Encoding Decoder: Example -In this article, we will discuss a simple Unicode decoder implemented in C, -specifically within the context of the `lexbor` library. The code can be found in -the source file -[lexbor/encoding/buffer/decode/decoder.c](https://github.com/lexbor/lexbor/blob/master/examples/lexbor/encoding/buffer/decode/decoder.c). -This program is designed to take a specified character encoding from the command -line, read input data, and decode it into Unicode code points, displaying the -result in a format suitable for further processing or representation. +This article delves into the purpose and functionality of the code from the file `lexbor/encoding/buffer/decode/decoder.c`. The example demonstrates how to utilize the `lexbor` library to decode text from various encodings, converting it to Unicode code points. We'll explore key sections of the code to understand how it achieves this. -## Code Structure Overview +## Key Code Sections -The code begins with the necessary includes, defines, and utility functions -required for the decoder's operation. Key components include error handling, -usage instructions, and the main decoding loop. +### Initialization and Argument Handling + +The code begins by checking command-line arguments to ensure an encoding name is provided and initializing necessary components. -### Error Handling Macro +```c +if (argc != 2) { + usage(); + exit(EXIT_SUCCESS); +} -The `FAILED` macro is defined to streamline error reporting throughout the code. -It takes a boolean indicating if usage should be displayed, followed by a -formatted message. 
If an error occurs, this macro outputs the error message to -standard error and, if requested, invokes the `usage()` function to display -acceptable encoding options. +/* Determine encoding from first argument from command line */ +encoding = lxb_encoding_data_by_pre_name((const lxb_char_t *) argv[1], strlen(argv[1])); +if (encoding == NULL) { + FAILED(true, "Failed to get encoding from name: %s\n\n", argv[1]); +} +``` -### Usage Function +Here, `argc` is checked to guarantee exactly one argument is provided. The `usage()` function outputs how to use the program if the condition isn't met. The `lxb_encoding_data_by_pre_name()` function fetches encoding data based on the provided encoding name. If the encoding cannot be found, `FAILED()` is called to print an error and exit. -The `usage` function is a simple utility that displays how the program should be -invoked and lists the character encodings that the decoder supports. This -function becomes crucial when the user fails to provide the expected arguments. +### Decoder Initialization -### Main Function Logic +Next, the decoder is initialized with the specified encoding and a buffer for storing code points: -The `main` function serves as the entry point of the application. It handles -argument parsing, encoding determination, and the initialization of the decoding -process. +```c +status = lxb_encoding_decode_init(&decode, encoding, cp, sizeof(cp) / sizeof(lxb_codepoint_t)); +if (status != LXB_STATUS_OK) { + FAILED(false, "Failed to initialization decoder"); +} -#### Argument Parsing +status = lxb_encoding_decode_replace_set(&decode, LXB_ENCODING_REPLACEMENT_BUFFER, LXB_ENCODING_REPLACEMENT_BUFFER_LEN); +if (status != LXB_STATUS_OK) { + FAILED(false, "Failed to set replacement code points for decoder"); +} +``` -The program checks if exactly one argument (the encoding name) has been -provided. If not, it calls the `usage()` function and exits gracefully. 
+The `lxb_encoding_decode_init()` function initializes the decoder, and `lxb_encoding_decode_replace_set()` sets replacement code points to handle invalid sequences during decoding. Both functions return a status code that must be checked to prevent further errors. -#### Encoding Retrieval +### Reading and Decoding Input -Next, it uses the `lxb_encoding_data_by_pre_name` function to retrieve the -encoding data based on the provided encoding name. If the encoding cannot be -determined, the `FAILED` macro is invoked with appropriate error handling. +The core of the example is a loop that reads from `stdin` and decodes the data: -#### Decoder Initialization +```c +do { + /* Read standard input */ + read_size = fread(inbuf, 1, sizeof(inbuf), stdin); + if (read_size != sizeof(inbuf)) { + if (feof(stdin)) { + loop = false; + } else { + FAILED(false, "Failed to read stdin"); + } + } -Once the encoding is acquired, the decoder is initialized using -`lxb_encoding_decode_init`. It also sets up a buffer for any replacement -characters that may need to be utilized during the decoding process. Each -initialization step includes error checking to ensure the decoder is prepared -for processing the input data. + /* Decode incoming data */ + data = (const lxb_char_t *) inbuf; + end = data + read_size; -### Decoding Loop + do { + status = encoding->decode(&decode, &data, end); -The main decoding operation occurs within a loop that reads data from standard -input. The program continuously reads chunks of data into a buffer (`inbuf`) -until the end of the input is reached. + buf_length = lxb_encoding_decode_buf_used(&decode); -#### Buffer Processing + for (size_t i = 0; i < buf_length; i++) { + if (cp[i] >= 0x00A0) { + /* Code point is Unicode */ + printf("\\u%04X", cp[i]); + } else { + /* Code point is ASCII */ + printf("\\x%02X", cp[i]); + } + } -For each chunk of data read, the program decodes the input using the encoding's -decode function. 
It iterates over the decoded results, determining whether each -code point is an ASCII character or a Unicode character. The output format uses -a hexadecimal representation for both types of characters, with Unicode points -prefixed by `\u` and ASCII points by `\x`. + lxb_encoding_decode_buf_used_set(&decode, 0); + } while (status == LXB_STATUS_SMALL_BUFFER); +} while (loop); +``` -#### Finalizing Decoding +This section reads input into `inbuf` and updates the decoder with `encoding->decode()`. It processes the buffer in chunks, printing converted code points as either Unicode or ASCII, depending on their values. The `lxb_encoding_decode_buf_used()` function returns the number of decoded code points, and this information is used to print the decoded values. -After all input data has been processed, the decoder's `finish` function is -called. This function ensures that any remaining code points, particularly those -that could not be fully processed, are correctly handled. The remaining code -points are then printed if any exist in the output buffer. +### Finishing the Decoding Process -## Conclusion +Finally, after all input has been processed, the decoder flushes any remaining code points: -This `decoder.c` example illustrates the practical use of the `lexbor` library for -handling various character encodings and converting them into a clear, usable -form. By leveraging the available utility functions and error handling methods, -the code provides a robust framework for decoding inputs in a specified -encoding, making it valuable for any application that requires processing text -in diverse formats. \ No newline at end of file +```c +(void) lxb_encoding_decode_finish(&decode); + +/* + * We need to check the out buffer after calling the finish function. + * If there was not enough data to form a code point, then the finish + * function will add the replacement character to the out buffer. 
+ */ +buf_length = lxb_encoding_decode_buf_used(&decode); + +if (buf_length != 0) { + for (size_t i = 0; i < buf_length; i++) { + if (cp[i] >= 0x00A0) { + printf("\\u%04X", cp[i]); + } else { + printf("\\x%02X", cp[i]); + } + } +} +``` + +The `lxb_encoding_decode_finish()` function ensures all data is processed, adding replacement characters if necessary. The remaining code points are then printed similarly to the earlier steps. + +## Notes + +- **Error Handling**: The use of the `FAILED()` macro ensures graceful termination upon encountering errors. +- **Encoding Support**: The `usage()` function lists the supported encodings that the program can handle. +- **Buffer Management**: Adequate handling of input and decoding buffers is critical for managing memory and ensuring correct decoding. + +## Summary + +This example demonstrates how to use the `lexbor` library for decoding text from various encodings to Unicode code points. Key aspects include initializing the decoder, reading input in manageable chunks, handling errors gracefully, and ensuring all data is processed. Understanding this example is valuable for leveraging `lexbor` in applications requiring robust text encoding handling. \ No newline at end of file diff --git a/source/examples/encoding/buffer/decode/validate.md b/source/examples/encoding/buffer/decode/validate.md index effd71b..3b641fe 100644 --- a/source/examples/encoding/buffer/decode/validate.md +++ b/source/examples/encoding/buffer/decode/validate.md @@ -1,76 +1,40 @@ -# UTF-8 Decoding and Replacement Example +# Validating and Replacing Invalid UTF-8 Encodings: Example -This article will explain a C code example that demonstrates UTF-8 decoding and -the handling of invalid byte sequences using the `lexbor` library. The source file -for the example is -[lexbor/encoding/buffer/decode/validate.c](https://github.com/lexbor/lexbor/blob/master/examples/lexbor/encoding/buffer/decode/validate.c). 
+This article explains the example file `lexbor/encoding/buffer/decode/validate.c` +which demonstrates how to decode a UTF-8 encoded string and handle invalid byte +sequences by replacing them with a specific replacement sequence using the `lexbor` library. -## Overview +The purpose of the example is to +show how to initialize a decoder, set replacement sequences for invalid byte +sequences, and decode a UTF-8 string, handling errors gracefully. This example +is useful to those needing to ensure robust UTF-8 decoding in their applications. -The provided code illustrates how to initialize a decoder for UTF-8 encoded -strings and replace any invalid byte sequences with specified replacement code -points. This is accomplished utilizing the lexbor encoding API. +## Key Code Sections -## Code Breakdown +### Initialization of Encoding Data -### Including Necessary Headers - -At the start of the code, the relevant header file from the `lexbor` library is -included: +In the first significant part of the code, we initialize the `lexbor` encoding +data for UTF-8: ```c -#include -``` - -This inclusion is necessary as it provides the required declarations and -definitions for encoding operations performed later in the code. - -### Defining a Macro for Error Handling +const lxb_encoding_data_t *encoding; -A macro named `FAILED` is defined to handle errors gracefully: - -```c -#define FAILED(...) \ - do { \ - fprintf(stderr, __VA_ARGS__); \ - fprintf(stderr, "\n"); \ - \ - exit(EXIT_FAILURE); \ - } \ - while (0) +/* Initialize for UTF-8 encoding */ +encoding = lxb_encoding_data(LXB_ENCODING_UTF_8); ``` -This macro uses `fprintf` to print error messages to standard error and then -exits the program with `EXIT_FAILURE`. It helps streamline error reporting -throughout the code. +This uses the `lxb_encoding_data` function to obtain a pointer to the encoding +data for UTF-8, as specified by the constant `LXB_ENCODING_UTF_8`. 
-### Main Function and Buffer Preparation +### Decoder Initialization -The main function initializes several variables, including a buffer for decoded -code points and an instance of the decoder: +We then proceed with initializing the decoder by using `lxb_encoding_decode_init`: ```c -int main(int argc, const char *argv[]) { - size_t buf_length; - lxb_status_t status; - lxb_codepoint_t cp[32]; - lxb_encoding_decode_t decode; - const lxb_encoding_data_t *encoding; - - const lxb_char_t *data = (const lxb_char_t *) "Привет,\x80 мир!"; - const lxb_char_t *end = data + strlen((char *) data); -``` +lxb_status_t status; +lxb_codepoint_t cp[32]; +lxb_encoding_decode_t decode; -In this segment, a buffer `cp` is defined to hold up to 32 decoded code points. -The `data` variable contains a UTF-8 string that includes an invalid byte -(`\x80`). The `end` variable calculates the pointer to the end of the `data`. - -### Initializing the Decoder - -The code initializes the decoder for UTF-8 using: - -```c -encoding = lxb_encoding_data(LXB_ENCODING_UTF_8); status = lxb_encoding_decode_init(&decode, encoding, cp, sizeof(cp) / sizeof(lxb_codepoint_t)); if (status != LXB_STATUS_OK) { @@ -78,14 +42,14 @@ if (status != LXB_STATUS_OK) { } ``` -Here, `lxb_encoding_data` retrieves the encoding data for UTF-8. The -`lxb_encoding_decode_init` function sets up the decoder with the encoding -information and the previously defined buffer for decoded code points. If -initialization fails, the `FAILED` macro is invoked. +Here, `lxb_encoding_decode_init` initializes the `decode` structure for the given +encoding and prepares it to store code points in the `cp` buffer. If this operation +fails, an error message is printed and the program exits. 
-### Configuring Replacement Settings +### Setting Replacement Code Points -Next, the code sets up settings for replacing invalid byte sequences: +Invalid byte sequences are handled by setting a replacement sequence with +`lxb_encoding_decode_replace_set`: ```c status = lxb_encoding_decode_replace_set(&decode, LXB_ENCODING_REPLACEMENT_BUFFER, @@ -95,44 +59,53 @@ if (status != LXB_STATUS_OK) { } ``` -This step allows the decoder to specify how to handle invalid sequences by using -the replacement character defined in lexbor. Again, the error handling is -consistent throughout. +By using the `LXB_ENCODING_REPLACEMENT_BUFFER` and associated length macro, +we configure the decoder to substitute invalid sequences with a predefined replacement. -### Decoding the Input String +### Decoding the UTF-8 String -The actual decoding is performed with the following: +The core decoding process is performed with: ```c +const lxb_char_t *data = (const lxb_char_t *) "Привет,\x80 мир!"; +const lxb_char_t *end = data + strlen((char *) data); + status = encoding->decode(&decode, &data, end); if (status != LXB_STATUS_OK) { /* In this example, this cannot happen. */ } ``` -This line invokes the decoding process, moving through the input string from -`data` to `end`. The decoder attempts to handle any valid sequences and replaces -any invalid sequences as configured earlier. +Here, `data` contains the UTF-8 string to be decoded, including an invalid byte +sequence (`\x80`). We call the `encoding->decode` function to process the string +and handle any invalid sequences using the previously set replacement. -### Outputting the Decoded Values +### Printing the Result Finally, the decoded code points are printed: ```c -buf_length = lxb_encoding_decode_buf_used(&decode); +size_t buf_length = lxb_encoding_decode_buf_used(&decode); for (size_t i = 0; i < buf_length; i++) { printf("0x%04X\n", cp[i]); } ``` -Here, `lxb_encoding_decode_buf_used` retrieves the number of valid code points -decoded. 
Then, a loop iterates over each code point in the buffer, printing the -hexadecimal representation. +The `lxb_encoding_decode_buf_used` function returns the number of code points +stored in the buffer, which we then iterate over, printing each as a hexadecimal value. + +## Notes + +1. The `FAILED` macro is used for error handling by printing a message and exiting. +2. The invalid byte sequence, `\x80`, is replaced using the specified replacement sequence. +3. The example demonstrates how to handle both initialization and runtime errors + gracefully. -## Conclusion +## Summary -This example effectively showcases the use of the `lexbor` library for decoding -UTF-8 strings while managing potentially invalid byte sequences. By initializing -the decoder, setting up replacement strategies, and decoding the input string, -the program demonstrates a robust method for handling encoding issues in C. \ No newline at end of file +This example showcases the proper use of the `lexbor` library for decoding UTF-8 +strings while managing invalid byte sequences. It covers data initialization, +decoder setup, and configurable error handling using replacement sequences. +Understanding this example is essential for developers needing robust UTF-8 +decoding in their lexbor-based applications. \ No newline at end of file diff --git a/source/examples/encoding/buffer/encode/encode.md b/source/examples/encoding/buffer/encode/encode.md index 1d41f59..1ab7529 100644 --- a/source/examples/encoding/buffer/encode/encode.md +++ b/source/examples/encoding/buffer/encode/encode.md @@ -1,122 +1,93 @@ -# Encoding Unicode Code Points to UTF-8 Example +# Encoding Unicode Code Points to UTF-8: Example -This article explains the encoding of Unicode code points to a UTF-8 byte string -using the `lexbor` library. The source code is located in -[lexbor/encoding/buffer/encode/encode.c](https://github.com/lexbor/lexbor/blob/master/examples/lexbor/encoding/buffer/encode/encode.c). 
-This example demonstrates how to initialize the encoder, encode Unicode code -points, and handle the output appropriately. +In this article, we will analyze the code example found in `lexbor/encoding/buffer/encode/encode.c`. This example demonstrates how to use the `lexbor` library to encode Unicode code points into a UTF-8 byte string. We will delve into the details of how the buffer is managed and how the `lexbor` encoding functions are utilized to achieve the desired result. -## Overview +## Key Code Sections -The primary purpose of this code is to convert an array of Unicode code points -into a UTF-8 encoded string. The code includes error handling, memory allocation -for the output buffer, and final output printing. +### Setup and Initialization -## Code Explanation - -### Includes and Macros - -The code begins with the inclusion of the `lexbor/encoding/encoding.h` header -file, which provides necessary functions and definitions for encoding -operations. A macro called `FAILED` is defined to handle error reporting: +The initial part of the code sets up the environment, prepares the buffer, and initializes the encoder. Let's take a closer look: ```c -#define FAILED(...) \ - do { \ - fprintf(stderr, __VA_ARGS__); \ - fprintf(stderr, "\n"); \ - \ - exit(EXIT_FAILURE); \ - } \ - while (0) -``` +lxb_status_t status; +lxb_encoding_encode_t encode; +const lxb_codepoint_t *cps_ref, *cps_end; +const lxb_encoding_data_t *encoding; -This macro simplifies the error handling by printing an error message to -`stderr` and exiting the program if there is a failure during initialization. 
+/* Prepare buffer */ +lxb_char_t buffer[1024]; -### Main Function +/* Unicode code points for encoding */ +lxb_codepoint_t cps[] = {0x041F, 0x0440, 0x0438, 0x0432, 0x0435, 0x0442, + 0x002C, 0x0020, 0x043C, 0x0438, 0x0440, 0x0021}; -The `main` function initializes several variables and prepares to encode the -Unicode code points: +cps_ref = cps; +cps_end = cps_ref + (sizeof(cps) / sizeof(lxb_codepoint_t)); -```c -int main(int argc, const char *argv[]) -{ - lxb_status_t status; - lxb_encoding_encode_t encode; - const lxb_codepoint_t *cps_ref, *cps_end; - const lxb_encoding_data_t *encoding; - - /* Prepare buffer */ - lxb_char_t buffer[1024]; +/* Initialization */ +encoding = lxb_encoding_data(LXB_ENCODING_UTF_8); ``` -In this section, a buffer of 1024 characters is created to hold the encoded byte -string. The `lxb_codepoint_t` array contains several predefined Unicode code -points. - -### Unicode Code Points +Here, a buffer is prepared to hold the UTF-8 encoded bytes. The array `cps` contains the Unicode code points to be encoded. The code points are initialized and assigned pointers, `cps_ref` and `cps_end`, which reference the start and end of the code points array. -The code points initialized in the `cps` array represent Cyrillic characters and -symbols: - -```c -lxb_codepoint_t cps[] = {0x041F, 0x0440, 0x0438, 0x0432, 0x0435, 0x0442, - 0x002C, 0x0020, 0x043C, 0x0438, 0x0440, 0x0021}; -``` +The `lxb_encoding_data` function is then called with `LXB_ENCODING_UTF_8` to get the encoding data for UTF-8. 
### Encoder Initialization -The encoding data for UTF-8 is retrieved and the encoder is initialized with: +Next, we initialize the encoder: ```c -encoding = lxb_encoding_data(LXB_ENCODING_UTF_8); status = lxb_encoding_encode_init(&encode, encoding, buffer, sizeof(buffer)); if (status != LXB_STATUS_OK) { - FAILED("Failed to initialization encoder"); + FAILED("Failed to initialize encoder"); } ``` -Here, `lxb_encoding_data` retrieves encoding information for UTF-8, and -`lxb_encoding_encode_init` initializes the encoding context. If the -initialization fails, the `FAILED` macro is invoked. +The `lxb_encoding_encode_init` function initializes the encoder with the specified encoding (UTF-8) and buffer. It takes as parameters the encoder object, the encoding data, the buffer, and its size. If initialization fails, the `FAILED` macro will output an error message and exit the program. -### Encoding Process +### Encoding the Code Points -Next, the code encodes the Unicode code points: +With the encoder initialized, we proceed to encode the code points: ```c +printf("Encode code points to UTF-8 byte string:\n"); + status = encoding->encode(&encode, &cps_ref, cps_end); if (status != LXB_STATUS_OK) { /* In this example, this cannot happen. */ } ``` -This line calls the `encode` function from the `encoding` structure, which -encodes the code points from `cps_ref` to `cps_end`. +Here, the `encoding->encode` function is invoked to encode the Unicode code points into the buffer as a UTF-8 string. It updates `cps_ref` to point to the next code point after the last encoded one upon completion. If encoding fails (though in this simple example it is not expected to), an error handling mechanism would be necessary. 
-### Output Preparation +### Finalizing and Outputting the Encoded String -After encoding, the buffer is terminated with a null character: +The following lines finalize the buffer and output the result: ```c +/* Terminate string */ buffer[ lxb_encoding_encode_buf_used(&encode) ] = 0x00; -``` -### Printing Results +/* Print result */ +cps_ref = cps; -Finally, the result is displayed: +for (; cps_ref < cps_end; cps_ref++) { + printf("0x%04X", *cps_ref); +} -```c printf("\nResult: %s\n", (char *) buffer); ``` -This prints the encoded UTF-8 string to standard output along with the original -Unicode values shown in hexadecimal format. +The string is terminated by setting the byte after the used buffer space to `0x00`. This ensures the buffer is null-terminated, making it a valid C string. + +The original code points are printed in a loop, followed by the UTF-8 encoded result, providing a clear comparison between input code points and the final output. + +## Notes + +- The example uses UTF-8 encoding, but the `lexbor` library supports various encodings. +- Error handling is minimal in this example. Production code should robustly handle potential encoding errors. +- This example highlights the flexibility and ease-of-use of the `lexbor` library for encoding purposes. -## Conclusion +## Summary -This code example effectively demonstrates the usage of the `lexbor` encoding -library for converting Unicode code points to a UTF-8 encoded string. It -emphasizes proper initialization, error handling, and output formatting, which -are essential for working with character encoding in C programming. \ No newline at end of file +This example demonstrates how to encode an array of Unicode code points into a UTF-8 byte string using the `lexbor` library. Key takeaways include initializing the encoding environment, handling the buffers correctly, and encoding the data. 
Understanding this process is crucial for developers looking to work with text encoding in their applications using `lexbor`. \ No newline at end of file diff --git a/source/examples/encoding/buffer/encode/encoder.md b/source/examples/encoding/buffer/encode/encoder.md index f2d92e3..8259237 100644 --- a/source/examples/encoding/buffer/encode/encoder.md +++ b/source/examples/encoding/buffer/encode/encoder.md @@ -1,79 +1,12 @@ -# Encoder Example +# Encoding Data with Escaped Sequences: Example -This article provides an explanation of the `encoder.c` source file located in -the `lexbor/encoding/buffer/encode` directory. The intent of the code is to -implement a command-line utility that encodes input data based on the specified -character encoding name. The encoder processes Standard Input, converts it based -on escape sequences into code points, and outputs the encoded data to Standard -Output. +In this example, found in the file `lexbor/encoding/buffer/encode/encoder.c`, we delve into an implementation that reads input data, processes any escaped sequences, and encodes the data using the specified character encoding. The purpose of this code is to demonstrate how the `lexbor` library can be used to handle textual data with escaped sequences and convert it to various encodings. This write-up explains key parts of the program, focusing on the logic and usage of `lexbor` functions. -## Code Structure and Major Sections +## Key Code Sections -### Header and Includes +### Command Line Arguments Handling -At the beginning of the file, there are several include statements that bring in -necessary libraries: - -```c -#include -#include -#include -#include -``` - -These headers allow access to string manipulation functions, standard -input/output functionalities, and the defined encoding structures and functions -within the `lexbor` library. - -### Error Handling - -The `FAILED` macro is defined to streamline error handling within the code. 
It -prints an error message and usage instructions when an issue occurs: - -```c -#define FAILED(with_usage, ...) \ - do { \ - fprintf(stderr, __VA_ARGS__); \ - fprintf(stderr, "\n"); \ - \ - if (with_usage) { \ - usage(); \ - } \ - \ - exit(EXIT_FAILURE); \ - } \ - while (0) -``` - -This macro takes a boolean flag to determine if usage instructions should be -displayed before exiting. This ensures that any critical failures can inform -users about incorrect command usage. - -### Usage Function - -The `usage` function provides a simple guide on how to run the encoder, listing -available encodings. It helps users understand the valid options to include when -calling the program: - -```c -static void usage(void) -{ - printf("Usage: encoder \n\n"); - printf("Available encodings:\n"); - // List of encodings... -} -``` - -### Main Function - -The `main` function is the core of the program, where execution begins. It -handles command-line arguments, initializes encoding setups, reads from Standard -Input, and writes the encoded data to Standard Output. - -#### Command-Line Argument Handling - -The program expects one argument - the encoding name. If this is not provided, -the `usage` function is invoked: +The program starts with a basic check for command line arguments, where it expects exactly one argument specifying the desired encoding. ```c if (argc != 2) { @@ -82,67 +15,94 @@ if (argc != 2) { } ``` -#### Encoding Initialization +This section ensures that the user provides an encoding name, and if not, it shows usage instructions and exits. + +### Fetching and Initializing Encoding -The encoding is determined using the `lxb_encoding_data_by_pre_name` function, -which fetches the encoding data associated with the provided name. If it fails, -it reports an error: +The encoding is determined from the user-provided argument, and the encoder is initialized accordingly. 
```c encoding = lxb_encoding_data_by_pre_name((const lxb_char_t *) argv[1], strlen(argv[1])); if (encoding == NULL) { FAILED(true, "Failed to get encoding from name: %s\n", argv[1]); } + +status = lxb_encoding_encode_init(&encode, encoding, outbuf, sizeof(outbuf)); +if (status != LXB_STATUS_OK) { + FAILED(false, "Failed to initialize encoder"); +} ``` -After determining the encoding, the encoder is initialized with -`lxb_encoding_encode_init`: +Here, `lxb_encoding_data_by_pre_name` retrieves the encoding data, and `lxb_encoding_encode_init` initializes the encoding context. + +### Setting Replacement Bytes for Encoder + +Depending on the encoding specified, replacement bytes are set. This is crucial for handling invalid or unencodable sequences. ```c -status = lxb_encoding_encode_init(&encode, encoding, outbuf, sizeof(outbuf)); +if (encoding->encoding == LXB_ENCODING_UTF_8) { + status = lxb_encoding_encode_replace_set(&encode, LXB_ENCODING_REPLACEMENT_BYTES, LXB_ENCODING_REPLACEMENT_SIZE); +} +else { + status = lxb_encoding_encode_replace_set(&encode, (lxb_char_t *) "?", 1); +} + if (status != LXB_STATUS_OK) { - FAILED(false, "Failed to initialization encoder"); + FAILED(false, "Failed to set replacement bytes for encoder"); } ``` -This sets up a buffer for output based on the specified encoding type. +UTF-8 has specific replacement bytes, while other encodings use a generic question mark. -### Data Encoding Loop +### Processing Input Data -The heart of the encoding process is found in a `do-while` loop that reads from -stdin and encodes the input data: +The program reads data from standard input in chunks of 4096 bytes, processes each chunk, and converts it into code points. ```c -do { - read_size = fread(inbuf, 1, sizeof(inbuf), stdin); - // Encoding logic... 
-} while (loop); +read_size = fread(inbuf, 1, sizeof(inbuf), stdin); +if (read_size != sizeof(inbuf)) { + if (feof(stdin)) { + loop = false; + } + else { + FAILED(false, "Failed to read stdin"); + } +} + +data = (const lxb_char_t *) inbuf; +end = data + read_size; +cp_end = escaped_to_codepoint(data, end, cp, &state, &cp_rep, loop == false); ``` -If the end of the file is reached on standard input (`feof(stdin)`), the loop -breaks, indicating that no more data is available. +This part handles reading input, processes potential partial reads due to end-of-file, and calls the `escaped_to_codepoint` function to process the escaped sequences into code points. -#### Escaped Code Points Conversion +### Encoding and Output -The `escaped_to_codepoint` function handles the conversion of escape sequences -(e.g., '\x41' for 'A') into code points that can be processed. The logic checks -for valid escape sequences and builds the code points accordingly. If a broken -sequence is detected, it triggers an error: +After converting to code points, the data is encoded and written to standard output. ```c -static const lxb_codepoint_t * escaped_to_codepoint(const lxb_char_t *data, ... -if (*state != 0) { - // Handle escape sequence state... - // Process each character to build the codepoint... +do { + status = encoding->encode(&encode, &cp_ref, cp_end); + read_size = lxb_encoding_encode_buf_used(&encode); + + if (fwrite(outbuf, 1, read_size, stdout) != read_size) { + FAILED(false, "Failed to write data to stdout"); + } + + lxb_encoding_encode_buf_used_set(&encode, 0); } +while (status == LXB_STATUS_SMALL_BUFFER); ``` -### Finalizing and Outputting +This loop ensures that all data is properly encoded and outputted, even handling cases where the buffer might be too small on the first pass. + +### Finalizing Encoding -After encoding, the program finalizes the encoded output and writes any -remaining data to stdout. 
This is done using: +At the end of processing, the encoder is finalized to flush any remaining data. ```c +(void) lxb_encoding_encode_finish(&encode); + read_size = lxb_encoding_encode_buf_used(&encode); if (read_size != 0) { if (fwrite(outbuf, 1, read_size, stdout) != read_size) { @@ -151,14 +111,14 @@ if (read_size != 0) { } ``` -This ensures that any data that has not yet been flushed from the buffer is -written out before the program exits. +This ensures that any leftover data in the encoder’s internal buffer is written out. + +## Notes + +1. **Error Handling**: The macro `FAILED` helps in providing consistent error messages and exits on failure. +2. **Escaped Sequence Processing**: The function `escaped_to_codepoint` is crucial for converting escaped sequences like `\xNN` and `\uNNNN` into code points. +3. **Buffer Management**: Proper buffer management ensures that encoding processes can handle partial reads and writes effectively. -## Conclusion +## Summary -The `encoder.c` file is a functional implementation of an encoding utility using -the `lexbor` library. It effectively handles various character encodings, -processes input data in a loop, and provides useful output, making it a useful -tool for developers working with different text encodings. The awareness of -error handling and usage guidance further enhances its usability in command-line -environments. \ No newline at end of file +This example demonstrates how to use the `lexbor` library to handle input data with escaped sequences, converting it to the specified encoding. It showcases the critical steps for initializing encoders, processing input data, handling partial reads, and finalizing output. Understanding this example is essential for those looking to leverage `lexbor` for complex text encoding tasks in their applications. 
\ No newline at end of file diff --git a/source/examples/encoding/buffer/encode/validate.md b/source/examples/encoding/buffer/encode/validate.md index 5edee1c..9130df5 100644 --- a/source/examples/encoding/buffer/encode/validate.md +++ b/source/examples/encoding/buffer/encode/validate.md @@ -1,82 +1,116 @@ -# Unicode Encoding Example +# Validating Encoded Strings with `lexbor`: Example -This article explains the functionality of a Unicode encoding example, which can -be found in the source file -[lexbor/encoding/buffer/encode/validate.c](https://github.com/lexbor/lexbor/blob/master/examples/lexbor/encoding/buffer/encode/validate.c). -The code serves as an illustration of how to encode Unicode code points into a -UTF-8 byte string using the `lexbor` library. +This article explains the functioning of a code example found in the file +`lexbor/encoding/buffer/encode/validate.c`. This code demonstrates how to use +the `lexbor` library to encode a series of Unicode code points into a UTF-8 byte +string, validating and handling invalid code points along the way. -## Overview +The example showcases how to properly initialize an encoder with +the `lexbor` library, encode a sequence of Unicode code points into a UTF-8 byte +string, and manage any invalid code points encountered in the process. The +example's intent is to demonstrate the practical use of the `lexbor` encoding +library for encoding and validating Unicode sequences. -The example demonstrates the process of setting up an encoder, preparing a -buffer for the encoded result, and ultimately encoding a series of Unicode code -points. The code also highlights error handling when initializing the encoder -and configuring it with replacement bytes for invalid code points. +## Key Code Sections -## Code Explanation +### Initialization and Buffer Preparation + +The first critical step is to initialize the encoder and prepare the buffer for +the encoded output. 
+ +```c +lxb_encoding_encode_t encode; +const lxb_codepoint_t *cps_ref, *cps_end; +const lxb_encoding_data_t *encoding; + +/* Prepare buffer */ +lxb_char_t buffer[1024]; +``` + +Here, the `encode` structure is declared to hold the encoder state. An array of +Unicode code points (`cps`) is prepared, consisting of valid code points and one +invalid code point (`0x110000`). A buffer of 1024 bytes is allocated to hold the +encoded string. + +### Setting Up the Encoder + +```c +encoding = lxb_encoding_data(LXB_ENCODING_UTF_8); + +status = lxb_encoding_encode_init(&encode, encoding, buffer, sizeof(buffer)); +if (status != LXB_STATUS_OK) { + FAILED("Failed to initialize encoder"); +} + +status = lxb_encoding_encode_replace_set(&encode, LXB_ENCODING_REPLACEMENT_BYTES, + LXB_ENCODING_REPLACEMENT_SIZE); +if (status != LXB_STATUS_OK) { + FAILED("Failed to set replacement bytes for encoder"); +} +``` + +The encoder is initialized with the `UTF-8` encoding and the provided buffer. +The function `lxb_encoding_encode_init` takes care of this initialization. +Additionally, `lxb_encoding_encode_replace_set` sets the replacement bytes to +handle invalid code points. This ensures that invalid entries are substituted +with a predefined replacement sequence. + +### Encoding the Unicode Code Points + +```c +cps_ref = cps; +cps_end = cps_ref + (sizeof(cps) / sizeof(lxb_codepoint_t)); + +printf("Encode code points to UTF-8 byte string:\n"); + +/* Encode */ +status = encoding->encode(&encode, &cps_ref, cps_end); +if (status != LXB_STATUS_OK) { + /* In this example, this cannot happen. */ +} +``` + +The encoder processes the Unicode code points with the `encode` function, +transforming them into a UTF-8 byte string. The output buffer will contain the +encoded byte string, and a bad code point is replaced by the replacement bytes +set previously. 
+ +### Finalizing the Encoding and Printing the Result + +```c +buffer[ lxb_encoding_encode_buf_used(&encode) ] = 0x00; + +/* Print result */ +cps_ref = cps; + +for (; cps_ref < cps_end; cps_ref++) { + printf("0x%04X", *cps_ref); +} + +printf("\nResult: %s\n", (char *) buffer); +``` + +The encoded string is null-terminated using the `lxb_encoding_encode_buf_used` +to get the actual length of the encoded content. The original code points and +the resulting encoded string are printed to the stdout, showcasing how the +encoder dealt with the input, including the invalid code point. + +## Notes + +- **Error Handling**: Proper error handling is demonstrated with the `FAILED` + macro, ensuring that the program exits if initialization or replacement + byte setup fails. +- **Invalid Code Points**: The example shows how to handle invalid Unicode code + points gracefully by setting replacement bytes. +- **Initialization and Finalization**: Correct encoder initialization, buffer + setup, and string termination are important for ensuring the accuracy and + safety of the encoding process. + +## Summary -### Includes and Macros - -The code begins by including necessary header files, specifically `string.h` for -string manipulation and `lexbor/encoding/encoding.h` for encoding functions from -the `lexbor` library. A macro named `FAILED` is defined for error handling, which -simplifies reporting errors by outputting a message to `stderr` and exiting the -program with a failure status. - -### Main Function - -The `main` function encapsulates the entire encoding process. It starts by -declaring variables that will be used later, including an -`lxb_encoding_encode_t` structure to handle the encoding state, pointers to a -list of code points, and a buffer initialized to hold the resulting UTF-8 byte -string. 
- -### Code Points Preparation - -A set of Unicode code points is prepared in an array called `cps`, which -includes valid points such as Cyrillic characters, a comma, a space, and an -exclamation mark. Notably, one of the code points included is `0x110000`, which -is invalid. This serves to demonstrate how replacement strategies can be applied -when dealing with unexpected values. - -### Encoder Initialization - -The code subsequently retrieves the encoding data for UTF-8 using the -`lxb_encoding_data` function. The encoder is initialized with -`lxb_encoding_encode_init`, which requires the encoder structure, encoding data, -a buffer, and the size of that buffer. If initialization fails, the program uses -the `FAILED` macro to report the error and terminate. - -### Setting Replacement Bytes - -After successful initialization, the example configures the encoder to use -specific replacement bytes for invalid code points by invoking -`lxb_encoding_encode_replace_set`. This ensures that when an invalid code point -is encountered during the encoding process, a predetermined sequence of bytes -will replace it. - -### Encoding Process - -A message is printed to indicate the start of the encoding process. The actual -encoding is performed using the `encode` function pointer from the encoding -data, which takes the encoder structure and a range defined by pointers to the -beginning and end of the code points. - -If the encoding state indicates an error, it will be silently ignored here since -it should not occur in this example. After encoding, the buffer is appropriately -terminated with a null byte to signify the end of the string. - -### Output - -Finally, the code loops through the original code points, printing each as a -hexadecimal value to the console. It then outputs the resulting UTF-8 string -stored in the buffer, demonstrating the successful encoding of the input code -points. 
- -## Conclusion - -This example showcases how to utilize the `lexbor` library to encode Unicode code -points into a UTF-8 byte string while implementing error handling and -customization through replacement bytes for invalid code points. By following -the steps outlined, developers can efficiently manage Unicode data in their -applications. \ No newline at end of file +This example demonstrates fundamental techniques in using the `lexbor` encoding +library for converting Unicode code points to a UTF-8 byte string. It emphasizes +error handling, the importance of setting replacement bytes for invalid code +points, and proper buffer management. Understanding these concepts is crucial +for developers working with Unicode text processing and encoding using the +`lexbor` library. \ No newline at end of file diff --git a/source/examples/encoding/buffer/from_to.md b/source/examples/encoding/buffer/from_to.md index aa49384..7cc34ec 100644 --- a/source/examples/encoding/buffer/from_to.md +++ b/source/examples/encoding/buffer/from_to.md @@ -1,87 +1,172 @@ -# Encoding Conversion Example - -This article describes an example of encoding conversion using the `from_to` -program from the `lexbor` library, specifically found in the source file -[lexbor/encoding/buffer/from_to.c](https://github.com/lexbor/lexbor/blob/master/examples/lexbor/encoding/buffer/from_to.c). -The program reads data from the standard input, converts the data from one -encoding to another (specified by the user), and outputs the result to the -standard output. - -## Overview - -The main function of the program is to facilitate the conversion of text between -various character encodings. This operation is critical in environments where -data needs to be interpreted correctly across different platforms or -applications that utilize specific character encoding schemes. The program -checks the validity of input encodings, performs the decode and encode -operations, and handles errors appropriately. 
- -### Major Components - -1. **Macro Definition for Error Handling** - A macro named `FAILED` is defined to centralize error handling within the - program. It takes a flag (`with_usage`) to determine if usage instructions - should be displayed, outputs an error message to `stderr`, and exits the - program. This reduces redundancy in error handling and improves code - maintainability. - - ```c - #define FAILED(with_usage, ...) \ - ``` - -2. **Usage Function** - The `usage` function prints out how to use the program along with available - encoding names. If the required number of arguments is not provided - (specifically two arguments for 'from' and 'to'), this function will be - invoked to guide the user. - - ```c - static void usage(void) {...} - ``` - -3. **Main Function Logic** - The `main` function is where the primary execution occurs. It begins by - checking command-line arguments to ensure the user has provided the necessary - inputs. The program uses `lxb_encoding_data_by_pre_name` to retrieve encoding - information based on user input, and if either input is invalid, it calls the - `FAILED` macro. - -4. **Initialization of Encoder and Decoder** - Both the encoder and decoder are initialized with their respective encoding - data. The decoder will convert input bytes into code points (abstract - character representations), while the encoder converts these code points back - into byte sequences of the target encoding. - - ```c - status = lxb_encoding_decode_init(&decode, from, cp, sizeof(cp) / sizeof(lxb_codepoint_t)); - ``` - -5. **Processing Input Data** - The program reads data from `stdin` in a loop until all input is processed. - The decode operation converts the input byte sequence into code points, which - are then passed to the encoder to convert into the target encoding. The - `fwrite` function is employed to write the output to `stdout`. - - ```c - size = fread(inbuf, 1, sizeof(inbuf), stdin); - ``` - -6. 
**Finalization** - After all input has been processed, the program ensures that any remaining - decoded data is encoded and written to the output. Special care is taken for - the `iso-2022-jp` encoding, which may require specific handling to finalize - the conversion. - - ```c - (void) lxb_encoding_encode_finish(&encode); - ``` - -## Conclusion - -The `from_to` example illustrates how to adeptly handle encoding conversions in -C using the `lexbor` library. By providing a structured way to manage different -encodings and offering clear error handling, this example serves as a -foundational component in the development of applications that require text data -manipulation across various encodings. The modular approach allows enhancements -to be easily integrated, such as supporting additional encodings or modifying -the input/output methods. \ No newline at end of file +# Character Encoding Conversion: Example + +This document explains the `lexbor/encoding/buffer/from_to.c` file in the `lexbor` library, which demonstrates how to read input data, decode it using one encoding, and then encode it with another encoding. This example highlights core functionalities of lexbor's encoding module. + +## Key Code Sections + +### Encoding Data Initialization + +The program starts by verifying the command-line arguments and retrieving the corresponding encoding data for the given `from` and `to` encodings. The `lxb_encoding_data_by_pre_name` function retrieves the encoding data by its name. 
+ +```c +if (argc != 3) { + usage(); + exit(EXIT_SUCCESS); +} + +/* Get encoding data for 'from' */ +from = lxb_encoding_data_by_pre_name((const lxb_char_t *) argv[1], + strlen(argv[1])); +if (from == NULL) { + FAILED(true, "Failed to get encoding from name: %s\n", argv[1]); +} + +/* Get encoding data for 'to' */ +to = lxb_encoding_data_by_pre_name((const lxb_char_t *) argv[2], + strlen(argv[2])); +if (to == NULL) { + FAILED(true, "Failed to get encoding from name: %s\n", argv[2]); +} +``` + +The `from` and `to` variables store the encoding data retrieved based on the user's input. If the encoding names provided are invalid, the program exits with an error message. + +### Decoder and Encoder Initialization + +Next, the code initializes the decode and encode contexts using the retrieved encoding data. + +```c +/* Initialization decode */ +status = lxb_encoding_decode_init(&decode, from, cp, + sizeof(cp) / sizeof(lxb_codepoint_t)); +if (status != LXB_STATUS_OK) { + FAILED(false, "Failed to initialization decoder"); +} + +status = lxb_encoding_decode_replace_set(&decode, + LXB_ENCODING_REPLACEMENT_BUFFER, LXB_ENCODING_REPLACEMENT_BUFFER_LEN); +if (status != LXB_STATUS_OK) { + FAILED(false, "Failed to set replacement code point for decoder"); +} + +/* Initialization encode */ +status = lxb_encoding_encode_init(&encode, to, outbuf, sizeof(outbuf)); +if (status != LXB_STATUS_OK) { + FAILED(false, "Failed to initialization encoder"); +} + +if (to->encoding == LXB_ENCODING_UTF_8) { + status = lxb_encoding_encode_replace_set(&encode, + LXB_ENCODING_REPLACEMENT_BYTES, LXB_ENCODING_REPLACEMENT_SIZE); +} +else { + status = lxb_encoding_encode_replace_set(&encode, (lxb_char_t *) "?", 1); +} + +if (status != LXB_STATUS_OK) { + FAILED(false, "Failed to set replacement bytes for encoder"); +} +``` + +The `lxb_encoding_decode_init` and `lxb_encoding_encode_init` functions initialize the decoder and encoder contexts, respectively. 
The replacements are set to handle invalid sequences during decoding and encoding. + +### Data Decoding and Encoding Loop + +The core of the program reads data from standard input, decodes it, and then encodes the resulting code points using the specified encoding. + +```c +do { + /* Read standard input */ + size = fread(inbuf, 1, sizeof(inbuf), stdin); + if (size != sizeof(inbuf)) { + if (feof(stdin)) { + loop = false; + } + else { + FAILED(false, "Failed to read stdin"); + } + } + + /* Decode incoming data */ + data = (const lxb_char_t *) inbuf; + end = data + size; + + do { + /* Decode */ + decode_status = from->decode(&decode, &data, end); + + cp_ref = cp; + cp_end = cp + lxb_encoding_decode_buf_used(&decode); + + do { + encode_status = to->encode(&encode, &cp_ref, cp_end); + if (encode_status == LXB_STATUS_ERROR) { + cp_ref++; + encode_status = LXB_STATUS_SMALL_BUFFER; + } + + size = lxb_encoding_encode_buf_used(&encode); + + /* The printf function cannot print \x00, it can be in UTF-16 */ + if (fwrite(outbuf, 1, size, stdout) != size) { + FAILED(false, "Failed to write data to stdout"); + } + + lxb_encoding_encode_buf_used_set(&encode, 0); + } + while (encode_status == LXB_STATUS_SMALL_BUFFER); + + lxb_encoding_decode_buf_used_set(&decode, 0); + } + while (decode_status == LXB_STATUS_SMALL_BUFFER); +} +while (loop); +``` + +This segment reads the input in chunks, decodes each chunk, and encodes the result. The loop handles the possibility that the buffers might be too small to hold the decoded or encoded data entirely at once. + +### Finalization of Decoding and Encoding + +After the input is fully processed, the program finalizes the decoding and encoding operations to ensure all data is correctly handled. 
+ +```c +/* End of file */ +/* In this moment encoder and decoder out buffer is empty */ + +/* First: finish decoding */ +(void) lxb_encoding_decode_finish(&decode); + +if (lxb_encoding_decode_buf_used(&decode)) { + cp_ref = cp; + cp_end = cp + lxb_encoding_decode_buf_used(&decode); + + (void) to->encode(&encode, &cp_ref, cp_end); + size = lxb_encoding_encode_buf_used(&encode); + + if (fwrite(outbuf, 1, size, stdout) != size) { + FAILED(false, "Failed to write data to stdout"); + } +} + +/* Second: finish encoding */ +(void) lxb_encoding_encode_finish(&encode); +size = lxb_encoding_encode_buf_used(&encode); + +if (size != 0) { + if (fwrite(outbuf, 1, size, stdout) != size) { + FAILED(false, "Failed to write data to stdout"); + } +} +``` + +The `lxb_encoding_decode_finish` and `lxb_encoding_encode_finish` functions ensure that any remaining data in the buffers is processed and outputted. + +## Notes + +- It is crucial to handle buffer sizes and potential overflows carefully to avoid data loss. +- Setting replacement characters or byte sequences helps manage invalid encoding sequences gracefully. +- Properly finalizing decoding and encoding processes ensures that all input data is correctly processed. + +## Summary + +This example illustrates how to use the `lexbor` library to convert data between different character encodings. It handles reading from standard input, decoding using one encoding, and then encoding to another, while managing buffer sizes and invalid sequences. Understanding this code helps users leverage lexbor's powerful encoding functionalities in their own applications. 
\ No newline at end of file diff --git a/source/examples/encoding/data_by_name.md b/source/examples/encoding/data_by_name.md index 5ff656b..25b3441 100644 --- a/source/examples/encoding/data_by_name.md +++ b/source/examples/encoding/data_by_name.md @@ -1,52 +1,46 @@ -# Encoding Data Retrieval Example +# Retrieve Encoding Data by Name: Example -This article provides an explanation of an example from the file -[lexbor/encoding/data_by_name.c](https://github.com/lexbor/lexbor/blob/master/examples/lexbor/encoding/data_by_name.c). -The purpose of this code is to demonstrate how to retrieve encoding data by its -name using the `lexbor` encoding library. The code illustrated here highlights the -procedure for accessing character encoding information, specifically focusing on -UTF-8. +This example demonstrates how to retrieve encoding data by name using the `lexbor` +library, as shown in the file `lexbor/encoding/data_by_name.c`. This code +illustrates the utilization of `lexbor` functions and data types to find specific +character encoding details based on a given encoding name. -## Code Explanation +The purpose of the example is to demonstrate +how to use the `lexbor` library to query character encoding information by +providing an encoding name. This example is helpful for understanding how to +interact with the encoding module of `lexbor`, which is crucial for various +tasks such as text processing, web scraping, or any application requiring +character set conversions. -The program starts with the necessary `#include` directive, which includes the -`lexbor` encoding library header file. This library provides the functionality -needed to work with different character encodings. 
+## Key Code Sections -### Main Function +### Finding Encoding Data by Name -The `main` function serves as the entry point of the program: +The main functionality of this example is encapsulated in the following lines: ```c -int main(int argc, const char *argv[]) -``` - -Here, it accepts two parameters: the argument count `argc` and an array of -argument strings `argv`. Although the parameters are not utilized in this -example, they are typically included for potential command-line functionality. - -### Retrieving Encoding Data - -The key operation occurs in the following block: - -```c -const lxb_encoding_data_t *enc_data; enc_data = lxb_encoding_data_by_name((lxb_char_t *) "uTf-8", 5); +if (enc_data == NULL) { + return EXIT_FAILURE; +} ``` -In this segment, the variable `enc_data` is declared as a pointer to -`lxb_encoding_data_t`, which represents the encoding data structure in `lexbor`. -The function `lxb_encoding_data_by_name` is called with two arguments: the -string "uTf-8" (with a deliberate mixed case) and the length of the string, -which is `5`. +Here, `lxb_encoding_data_by_name` is called with the encoding name "uTf-8" +and its length (5). This function is designed to return a pointer to +`lxb_encoding_data_t` which contains information about the encoding. -This function attempts to retrieve encoding data corresponding to the specified -name. If the name provided does not match any available encoding in the library, -the function will return `NULL`. +- **Function Call**: `lxb_encoding_data_by_name` converts the provided name + to a canonical form and searches for its associated encoding data. +- **Parameters**: + - `(lxb_char_t *) "uTf-8"`: The encoding name, cast to `lxb_char_t *`. + - `5`: The length of the encoding name. +- **Return Value**: The function returns a pointer to `lxb_encoding_data_t` + if the encoding is found. If not, `NULL` is returned. 
### Error Handling -The next block of code provides basic error handling: +After the encoding data is retrieved, the code checks if the returned pointer +is `NULL`: ```c if (enc_data == NULL) { @@ -54,40 +48,37 @@ if (enc_data == NULL) { } ``` -If `enc_data` is `NULL`, the program terminates with a failure status. This is -an important check to ensure that the encoding has been found before attempting -to access any of its properties, thus preventing potential runtime errors. +This ensures that the program handles the case where the encoding is not found +appropriately by exiting with `EXIT_FAILURE`. -### Output Encoding Name +### Printing Encoding Name -Upon successful retrieval of the encoding data, the program proceeds to print -the name of the encoding: +If the encoding is found, the name of the encoding is printed out: ```c printf("%s\n", enc_data->name); ``` -This line outputs the name of the encoding that has been retrieved, which in -this case would be "UTF-8", assuming the spelling was correct in the function -call. +`enc_data->name` holds the canonical encoding name. This line demonstrates +how to access and use the information within the `lxb_encoding_data_t` structure. -### Exit Status +## Notes -Finally, the program completes its execution successfully: - -```c -return EXIT_SUCCESS; -``` +- **Case Insensitivity**: The function `lxb_encoding_data_by_name` is + case-insensitive, as evidenced by the mixed-case input "uTf-8". +- **Canonical Form**: The returned encoding name is ensured to be in + a standard canonical form. +- **Static Data**: The encoding names and their data are typically + static and predefined within `lexbor`. -This line returns a success status to the operating system, indicating that the -program has run without any issues. +## Summary -## Conclusion +This example highlights how to use the `lexbor` library to look up encoding +data by name. 
By invoking `lxb_encoding_data_by_name`, users can retrieve +information about specific encodings efficiently. Understanding this process +is vital for applications that handle diverse text encodings, ensuring proper +text interpretation and conversion. -The example presented in -[lexbor/encoding/data_by_name.c](https://github.com/lexbor/lexbor/blob/master/examples/lexbor/encoding/data_by_name.c) -effectively demonstrates how to access encoding data using the `lexbor` encoding -library. It showcases the importance of error handling and provides a simple way -to retrieve and display the name of a character encoding, using UTF-8 as a -practical example. This code can serve as a foundational component for -applications that require encoding information for text processing. \ No newline at end of file +For `lexbor` users, this example provides a clear and practical method to +interact with the library’s encoding functionalities, facilitating smooth +integration into larger projects requiring robust encoding support. \ No newline at end of file diff --git a/source/examples/encoding/single/decode/decode.md b/source/examples/encoding/single/decode/decode.md index 20cff2b..338afc2 100644 --- a/source/examples/encoding/single/decode/decode.md +++ b/source/examples/encoding/single/decode/decode.md @@ -1,145 +1,86 @@ -# UTF-8 Decoding Example +# Decoding UTF-8 to Code Points: Example -This article explains a code example from -[lexbor/encoding/single/decode/decode.c](https://github.com/lexbor/lexbor/blob/master/examples/lexbor/encoding/single/decode/decode.c), -which demonstrates how to decode a UTF-8 string into its respective code points -using the `lexbor` library. +The example provided in `lexbor/encoding/single/decode/decode.c` demonstrates +how to use the `lexbor` library to decode a UTF-8 string into its respective Unicode +code points. This process involves initializing a decoder, processing each character +in the string, and handling the decoding results. 
-## Introduction +## Key Code Sections -The primary purpose of this code is to decode a UTF-8 encoded string, -specifically the phrase "Привет, мир!" (which means "Hello, world!" in Russian), -into individual Unicode code points. It showcases the initialization of the -decoder, the processing of the input string, and outputting the results in a -formatted manner. +### Buffer Preparation -## Code Explanation - -### Include the Required Header - -The necessary header file is included at the beginning of the code: - -```c -#include -``` - -This header provides the necessary declarations for working with encoding -functionalities offered by lexbor. - -### Error Handling Macro - -The code defines a macro for error handling: - -```c -#define FAILED(...) \ - do { \ - fprintf(stderr, __VA_ARGS__); \ - fprintf(stderr, "\n"); \ - exit(EXIT_FAILURE); \ - } \ - while (0) -``` - -This macro outputs an error message to the standard error stream and exits the -program if a failure condition is met. It streamlines error handling throughout -the code. - -### Main Function - -The `main` function serves as the entry point of the program: - -```c -int main(int argc, const char *argv[]) -{ - ... -} -``` - -### Variable Declarations - -Several variables are declared to handle the decoding process, including: - -- `lxb_codepoint_t cp;`: Stores the current code point. -- `lxb_status_t status;`: Holds the status of operations. -- `lxb_encoding_decode_t decode;`: The decoder instance. -- `const lxb_encoding_data_t *encoding;`: Pointer to the encoding data. -- `const lxb_char_t *pos;`: Pointer to track the current position in the input - data. 
- -### Preparing the Input Buffer - -The input UTF-8 string is initialized, along with a pointer to the end of the -string: +The example starts by defining the input string in UTF-8 and preparing pointers to +iterate through this string: ```c const lxb_char_t *data = (const lxb_char_t *) "Привет, мир!"; const lxb_char_t *end = data + strlen((char *) data); ``` -The `strlen` function determines the length of the string to establish the end -of the data. +Here, `data` points to the start of the UTF-8 encoded string, and `end` points to +the address just after the last character of the string. This setup is essential +for the following decoding process. -### Setting Up the Encoding +### Initializing the Decoder -The program retrieves UTF-8 encoding data with: +Next, the example code initializes the decoder for UTF-8: ```c +const lxb_encoding_data_t *encoding; encoding = lxb_encoding_data(LXB_ENCODING_UTF_8); -``` - -This function sets up the necessary encoding data for subsequent decoding -operations. -### Initializing the Decoder - -The decoder is initialized with: - -```c -status = lxb_encoding_decode_init_single(&decode, encoding); +lxb_status_t status = lxb_encoding_decode_init_single(&decode, encoding); if (status != LXB_STATUS_OK) { FAILED("Failed to init decoder"); } ``` -If the initialization fails, the program invokes the `FAILED` macro to print the -error and exit. +Here, `lxb_encoding_data` retrieves the data structure for the specified encoding. +Then, `lxb_encoding_decode_init_single` initializes the decoding process using +this encoding. The function checks for successful initialization and exits if it +fails. -### Decoding Loop +### Decoding the String -Following initialization, the program enters a loop to decode each character in -the input string: +The core decoding loop processes each character in the input string: ```c while (data < end) { - ... 
-} -``` + pos = data; -Inside the loop, the current position (`pos`) is recorded, and the decoding -function is called: + cp = encoding->decode_single(&decode, &data, end); + if (cp > LXB_ENCODING_DECODE_MAX_CODEPOINT) { + continue; + } -```c -cp = encoding->decode_single(&decode, &data, end); -``` - -This line decodes a single UTF-8 character, advancing the input pointer `data` -as needed. The result is checked against a maximum allowable code point value, -although in this example, that condition is expected never to occur. - -### Outputting the Results - -For each decoded character, the code prints the results to the standard output: - -```c -printf("%.*s: 0x%04X\n", (int) (data - pos), pos, cp); + printf("%.*s: 0x%04X\n", (int) (data - pos), pos, cp); +} ``` -This formatted output provides both the original UTF-8 character (as a -substring) and its corresponding Unicode code point in hexadecimal format. - -## Conclusion - -The example demonstrates a straightforward approach to decoding a UTF-8 string -into Unicode code points using the `lexbor` library. It effectively showcases -initialization, error handling, and character decoding, providing a practical -illustration of working with character encodings in C. \ No newline at end of file +In each iteration of the loop: +- `pos` captures the current pointer position in the string. +- `decode_single` processes the next character, updating `data` to point to the + next position. +- If `cp` (code point) is valid, it prints the UTF-8 character and its + corresponding code point. + +The loop continues until `data` reaches the `end` of the string, effectively +decoding and printing every character. + +## Notes + +- The example is hardcoded to decode a specific UTF-8 string (`"Привет, мир!"`). +- The `decode_single` function is used for simplicity, suitable for decoding one + character at a time. +- Error handling is minimal, assuming that code points will always be valid for + the given string. 
+ +## Summary + +This example from `lexbor/encoding/single/decode/decode.c` demonstrates the basic +process of decoding a UTF-8 encoded string into Unicode code points using the lexbor +library. It initializes the decoder for UTF-8, iterates through the string, and +prints each character with its corresponding Unicode code point. This showcases +the practicality and ease of using the `lexbor` library for encoding-related tasks, +highlighting essential steps like buffer preparation, decoder initialization, +and the decoding process itself. \ No newline at end of file diff --git a/source/examples/encoding/single/decode/decoder.md b/source/examples/encoding/single/decode/decoder.md index 1d9b108..7dc7657 100644 --- a/source/examples/encoding/single/decode/decoder.md +++ b/source/examples/encoding/single/decode/decoder.md @@ -1,94 +1,12 @@ -# Encoding Decoder Example +# Decoding Text Data with `lexbor`: Example -In this article, we will explore the encoding decoder example found in the file -[lexbor/encoding/single/decode/decoder.c](https://github.com/lexbor/lexbor/blob/master/examples/lexbor/encoding/single/decode/decoder.c). -This code demonstrates how to decode input data from standard input according to -a specified character encoding. It provides a useful utility for developers -needing to handle various text encodings in their applications. +The example source file `lexbor/encoding/single/decode/decoder.c` provides an in-depth look at using the `lexbor` library to decode text data from various character encodings. The primary intent of this example is to demonstrate how to initialize a decoding context, read data from standard input, and correctly handle the decoding process using the `lexbor` library. This example targets developers aiming to understand the library's capabilities for text decoding and error handling.
-## Code Overview +## Key Code Sections -The main function of this code is to read data from standard input, decode it -according to the specified encoding, and print the corresponding Unicode values. -It uses the `lexbor` library to facilitate this process. +### Command-Line Argument Parsing and Usage -### Header and Includes - -At the beginning of the file, we find the licensing information and the -inclusion of the `lexbor` encoding header: - -```c -#include -``` - -This inclusion allows access to functions and definitions related to text -encoding and decoding. - -### Error Handling Macro - -A macro named `FAILED` is defined to streamline error management: - -```c -#define FAILED(with_usage, ...) \ - do { \ - fprintf(stderr, __VA_ARGS__); \ - fprintf(stderr, "\n"); \ - \ - if (with_usage) { \ - usage(); \ - } \ - \ - exit(EXIT_FAILURE); \ - } \ - while (0) -``` - -This macro takes a condition (`with_usage`) and, upon failure, prints an error -message to standard error, optionally displays usage instructions, and exits the -program with a failure status. This convenient encapsulation enhances code -readability and maintainability. - -### Usage Function - -Next, the `usage` function is defined: - -```c -static void usage(void) -{ - printf("Usage: decoder \n\n"); - printf("Available encodings:\n"); - ... -} -``` - -This function provides users with information about how to use the decoder -program and lists the available character encodings that can be specified as -command-line arguments. - -### Main Function Structure - -The `main` function begins with some variable declarations: - -```c -int main(int argc, const char *argv[]) -{ - size_t read_size; - lxb_status_t status; - lxb_codepoint_t cp = 0x0000; - lxb_encoding_decode_t decode; - const lxb_encoding_data_t *encoding; -``` - -**Variable Description:** -- `read_size`: To store the number of bytes read from standard input. -- `status`: To capture the success or failure of encoding operations. 
-- `cp`: A variable representing the code point being processed. -- `decode`: A structure for managing the decoding state. -- `encoding`: A pointer to the encoding data determined by user input. - -#### Input Validation - -The program first checks for the correct number of command-line arguments: +The program begins by checking if the correct number of command-line arguments is provided. If not, it displays the usage information and exits. ```c if (argc != 2) { @@ -97,64 +15,80 @@ if (argc != 2) { } ``` -If no encoding is specified, it invokes the `usage` function and exits -gracefully. +The `usage` function prints the expected usage of the program, including the list of supported encodings. This helps users understand how to properly invoke the decoder and which encodings are available. -#### Encoding Detection +### Encoding Initialization -Next, the program attempts to identify the desired encoding based on the -provided name: +The encoding provided by the user as a command-line argument is determined using the `lxb_encoding_data_by_pre_name` function. ```c -encoding = lxb_encoding_data_by_pre_name((const lxb_char_t *) argv[1], - strlen(argv[1])); +encoding = lxb_encoding_data_by_pre_name((const lxb_char_t *) argv[1], strlen(argv[1])); if (encoding == NULL) { FAILED(true, "Failed to get encoding from name: %s\n\n", argv[1]); } -``` - -If the specified encoding is not recognized, it triggers the `FAILED` macro, -providing feedback to the user. - -#### Decoder Initialization -The decoder is then initialized: - -```c status = lxb_encoding_decode_init_single(&decode, encoding); if (status != LXB_STATUS_OK) { FAILED(false, "Failed to init decoder"); } ``` -This step configures the decoder to use the chosen encoding. If the -initialization fails, the program prints an error and exits. +Here, the program retrieves the encoding data associated with the user-provided name. If the encoding is invalid, the program exits with an error message. 
Once the encoding data is obtained, it initializes the decoder object with `lxb_encoding_decode_init_single`. Proper initialization is crucial for subsequently processing the incoming data. -### Data Reading and Decoding Loop +### Reading from Standard Input -The program enters a loop to read from standard input: +The main decoding loop reads data from standard input in blocks and decodes them using the initialized decoder. ```c do { read_size = fread(inbuf, 1, sizeof(inbuf), stdin); - ... - while (data < end) { - cp = encoding->decode_single(&decode, &data, end); - ... + if (read_size != sizeof(inbuf)) { + if (feof(stdin)) { + loop = false; + } + else { + FAILED(false, "Failed to read stdin"); + } } + + data = (const lxb_char_t *) inbuf; + end = data + read_size; + + // Decoding happens here } while (loop); ``` -Within this loop: -- Data is read into a buffer (`inbuf`). -- Each code point is decoded using the `decode_single` method. -- Based on the value of `cp`, different output formats are printed for Unicode - and ASCII characters. +The input data is read in chunks and processed in a loop. The `fread()` function reads up to `sizeof(inbuf)` bytes from the standard input. If the read size is different (and the end of the file is not reached), it indicates an error. + +### Decoding Loop + +Inside the decoding loop, the program calls the decoder's `decode_single` method to decode individual characters. + +```c +while (data < end) { + cp = encoding->decode_single(&decode, &data, end); + if (cp > LXB_ENCODING_DECODE_MAX_CODEPOINT) { + if (cp == LXB_ENCODING_DECODE_CONTINUE) { + break; + } + printf("\\u%04X", LXB_ENCODING_REPLACEMENT_CODEPOINT); + continue; + } + + if (cp >= 0x00A0) { + printf("\\u%04X", cp); + } + else { + printf("\\x%02X", cp); + } +} +``` + +Here, `decode_single` decodes characters from the input buffer and manages input pointer `data`. 
Special handling is implemented for cases when the code point indicates a continuation (`LXB_ENCODING_DECODE_CONTINUE`) or an invalid character. Code points at or above U+00A0 are printed in `\u` format, while lower code points are printed in `\x` format. -### Output and Continuation +### Handling Remaining Unfinished Decodings -Finally, the program checks if the decoding process requires continuation, -outputting a replacement character where necessary: +After the loop, if there's an indication that decoding was incomplete (i.e., if `cp` equals `LXB_ENCODING_DECODE_CONTINUE`), the program outputs a Unicode replacement character. ```c if (cp == LXB_ENCODING_DECODE_CONTINUE) { @@ -162,9 +96,14 @@ if (cp == LXB_ENCODING_DECODE_CONTINUE) { } ``` -### Conclusion +This ensures that any unfinished multi-byte sequences are handled gracefully. + +## Notes + +1. **Error Handling**: The macro `FAILED` is used extensively to simplify error messages and includes conditional usage guidance. +2. **Buffer Management**: The program efficiently manages input data using a fixed-size buffer, ensuring that large input streams are handled correctly. +3. **Decoding Logic**: The implementation highlights robust decoding logic that appropriately handles different character encoding cases, including Unicode and ASCII conversions. + +## Summary -By effectively using the `lexbor` library's encoding functionalities, this code -provides a flexible and powerful example of how to decode various text encodings -from standard input. Developers can adapt this example for their applications, -thereby enhancing their ability to handle encoded text data efficiently. \ No newline at end of file +This decoding example from the `lexbor` library demonstrates essential techniques for initializing encoding contexts, reading and decoding text data, and handling various edge cases.
Being equipped with such knowledge allows developers to leverage `lexbor` for efficient and accurate character encoding transformation tasks across different applications. \ No newline at end of file diff --git a/source/examples/encoding/single/decode/validate.md b/source/examples/encoding/single/decode/validate.md index bb63386..4f7a3bf 100644 --- a/source/examples/encoding/single/decode/validate.md +++ b/source/examples/encoding/single/decode/validate.md @@ -1,92 +1,102 @@ -# UTF-8 Decoding and Validation Example +# UTF-8 String Decoding and Validation: Example -This article explains an example of decoding and validating a UTF-8 string, -using the `lexbor` library. The source file for this code example is -[lexbor/encoding/single/decode/validate.c](https://github.com/lexbor/lexbor/blob/master/examples/lexbor/encoding/single/decode/validate.c). -The primary objective of this code is to demonstrate how to properly decode a -UTF-8 encoded string, handle decoding errors, and output both valid code points -and error information for invalid byte sequences. +This article explains a demonstrative code file `lexbor/encoding/single/decode/validate.c` that +decodes and validates a UTF-8 encoded string to code points using the `lexbor` library. The example +focuses on initializing the decoder, processing each byte sequence in the input string to validate +and decode it, and handling invalid byte sequences. -## Code Breakdown +## Key Code Sections -The example begins with necessary includes and macro definitions. It imports the -required header file for `lexbor` encoding and defines a macro `FAILED` that -handles error reporting and terminates the program if an error occurs. +### Initialization and Setup -### Setting Up the Main Function - -The `main` function initializes variables needed for decoding. Here, -`lxb_status_t status`, `lxb_codepoint_t cp`, and `lxb_encoding_decode_t decode` -are declared. 
Additionally, a pointer to encoding data will be initialized as -the UTF-8 encoding. +The main function initializes the necessary variables and prepares the input buffer. This part +includes selecting the UTF-8 encoding type and initializing the decoder struct: ```c -lxb_status_t status; -lxb_codepoint_t cp; -lxb_encoding_decode_t decode; -const lxb_encoding_data_t *encoding; +int +main(int argc, const char *argv[]) +{ + lxb_status_t status; + lxb_codepoint_t cp; + lxb_encoding_decode_t decode; + const lxb_encoding_data_t *encoding; + const lxb_char_t *pos; + + /* Prepare buffer */ + const lxb_char_t *data = (const lxb_char_t *) "Привет,\x80 мир!"; + const lxb_char_t *end = data + strlen((char *) data); + + encoding = lxb_encoding_data(LXB_ENCODING_UTF_8); + + status = lxb_encoding_decode_init_single(&decode, encoding); + if (status != LXB_STATUS_OK) { + FAILED("Failed to init decoder"); + } ``` -### Preparing the Data Buffer +Key points: +- `lxb_encoding_data(LXB_ENCODING_UTF_8)` retrieves data for the UTF-8 encoding. +- `lxb_encoding_decode_init_single(&decode, encoding)` initializes the decoder structure for that encoding. + +### Decoding the Input String -The code prepares a buffer containing the string "Привет,\x80 мир!". The string -contains a valid UTF-8 sequence followed by an invalid byte sequence (0x80). The -end of the buffer is determined using `strlen` to ensure the decoding process -will iterate through the entire string. 
+The core of the decoding process involves a loop to read each byte sequence of the input string and +convert it to Unicode code points: ```c -const lxb_char_t *data = (const lxb_char_t *) "Привет,\x80 мир!"; -const lxb_char_t *end = data + strlen((char *) data); -``` + printf("Decode and validate UTF-8 string \"%s\" to code points:\n", (char *) data); -### Initializing the Decoder + while (data < end) { + pos = data; -The encoding is initialized with `lxb_encoding_data(LXB_ENCODING_UTF_8)`, and -the decoder is set up using the function `lxb_encoding_decode_init_single`. If -initialization fails, the `FAILED` macro reports the error and exits the -program. + cp = encoding->decode_single(&decode, &data, end); + if (cp > LXB_ENCODING_DECODE_MAX_CODEPOINT) { + printf("Bad byte sequences: 0x%04X; Replaced to: 0x%04X ('%s')\n", + *pos, LXB_ENCODING_REPLACEMENT_CODEPOINT, + LXB_ENCODING_REPLACEMENT_BYTES); -```c -encoding = lxb_encoding_data(LXB_ENCODING_UTF_8); -status = lxb_encoding_decode_init_single(&decode, encoding); -if (status != LXB_STATUS_OK) { - FAILED("Failed to init decoder"); -} + continue; + } + + printf("%.*s: 0x%04X\n", (int) (data - pos), pos, cp); + } ``` -### Decoding Process +Key points: +- The loop runs while `data` is less than `end` to process each byte sequence. +- `encoding->decode_single(&decode, &data, end)` performs the core decoding of the current byte sequence. +- If the decoded code point `cp` exceeds `LXB_ENCODING_DECODE_MAX_CODEPOINT`, it handles this invalid + byte sequence by replacing it with a predefined replacement code point and bytes. -The core loop of the example begins, where the program continuously decodes -until the end of the data buffer is reached. Each iteration decodes a single -code point from the UTF-8 data. 
+### Handling Invalid Byte Sequences + +When encountering invalid byte sequences, the code prints out an error message and continues: ```c -while (data < end) { - pos = data; - cp = encoding->decode_single(&decode, &data, end); -} + if (cp > LXB_ENCODING_DECODE_MAX_CODEPOINT) { + printf("Bad byte sequences: 0x%04X; Replaced to: 0x%04X ('%s')\n", + *pos, LXB_ENCODING_REPLACEMENT_CODEPOINT, + LXB_ENCODING_REPLACEMENT_BYTES); + + continue; + } ``` -If a valid code point is within the acceptable range defined by -`LXB_ENCODING_DECODE_MAX_CODEPOINT`, it gets printed together with the decoded -UTF-8 sequence. If an invalid byte sequence is encountered that exceeds the -maximum code point, it prints an error message indicating the bad byte -sequences. +Key points: +- The check `cp > LXB_ENCODING_DECODE_MAX_CODEPOINT` determines if the decoded value is valid. +- Invalid input sequences are substituted with `LXB_ENCODING_REPLACEMENT_CODEPOINT`, and an error message + is printed using the original byte. -```c -if (cp > LXB_ENCODING_DECODE_MAX_CODEPOINT) { - printf("Bad byte sequences: 0x%04X; Replaced to: 0x%04X ('%s')\n", - *pos, LXB_ENCODING_REPLACEMENT_CODEPOINT, - LXB_ENCODING_REPLACEMENT_BYTES); - continue; -} -``` +## Notes + +- The `lexbor` library's decoding functions must be initialized with the specific encoding data. +- Each byte sequence in the input string is validated and can be replaced if found invalid. +- The code uses a custom macro `FAILED` to handle initialization errors and terminate execution. -### Conclusion +## Summary -The program concludes by returning a success status if all decoding operations -complete without errors. In summary, this code serves as an illustrative example -of how to utilize the `lexbor` encoding library to decode and validate UTF-8 -encoded strings effectively, while properly handling potential errors in byte -sequences. 
By implementing this method, developers can ensure their applications -correctly interpret and display UTF-8 content. \ No newline at end of file +This example demonstrates how to decode and validate a UTF-8 encoded string using the `lexbor` +library. By initializing the decoder with UTF-8 encoding, processing each byte sequence, and +handling invalid sequences, it showcases essential functionality for anyone working with text +processing and encoding validation using `lexbor`. This provides a practical foundation for +handling encoded text robustly in applications. \ No newline at end of file diff --git a/source/examples/encoding/single/encode/encode.md b/source/examples/encoding/single/encode/encode.md index aa9c1d2..431f3ce 100644 --- a/source/examples/encoding/single/encode/encode.md +++ b/source/examples/encoding/single/encode/encode.md @@ -1,56 +1,69 @@ -# UTF-8 Encoding Example - -This article explains the purpose and functionality of the UTF-8 encoding -example provided in the file -[lexbor/encoding/single/encode/encode.c](https://github.com/lexbor/lexbor/blob/master/examples/lexbor/encoding/single/encode/encode.c). -The code demonstrates how to encode a series of Unicode code points into a UTF-8 -byte string using the `lexbor` encoding library. - -## Code Overview - -The program begins by including the necessary header file for the `lexbor` -encoding library. It defines a macro for error handling named `FAILED`, which -simplifies printing error messages and terminating the program if initialization -or execution fails. - -### Main Function Structure - -The `main` function serves as the entry point of the program. It declares -several variables needed for encoding, including a buffer for the output and an -encoder instance. The following key steps are involved in the encoding process: - -1. **Buffer Preparation**: A buffer of 1024 bytes is allocated to hold the UTF-8 - encoded string.
The variables `data` and `end` are set to track the start and - the end of the buffer. - -2. **Unicode Code Points**: An array of Unicode code points is defined and - terminated with a zero. These code points (e.g., Cyrillic characters for - "Привет, мир!") are the values that will be encoded. - -3. **Encoding Initialization**: The function `lxb_encoding_data` retrieves the - encoding data for UTF-8, which is passed to `lxb_encoding_encode_init_single` - to initialize the encoder. If the initialization fails, the `FAILED` macro is - invoked to handle the error. - -4. **Encoding Loop**: The program enters a loop where each code point is - processed for encoding: - - The current position in the buffer (`pos`) is saved. - - The encoder's `encode_single` function is called to perform the encoding. - The length of the encoded output is returned. - - If the encoding operation is successful, the resulting UTF-8 bytes are - printed alongside their corresponding Unicode code point in hexadecimal - format. - -5. **String Termination**: After processing all code points, the buffer is - null-terminated to ensure it is properly formatted as a C string. - -6. **Output Display**: Finally, the UTF-8 encoded string is printed to the - console, demonstrating the successful encoding of the provided Unicode code - points. - -## Conclusion - -Upon reaching the end of the program, it exits gracefully, indicating successful -execution. This example illustrates how to use the `lexbor` encoding library for -converting Unicode code points to a UTF-8 encoded string, providing a clear and -practical implementation of encoding functionality in C using `lexbor`. \ No newline at end of file +# UTF-8 Encoding: Example + +The source file under discussion is `lexbor/encoding/single/encode/encode.c`. This example demonstrates how to encode a sequence of Unicode code points into a UTF-8 byte string using the `lexbor` library.
The example covers the initialization of the encoding process, the encoding of individual Unicode code points, and the final assembly of the encoded string. + +## Key Code Sections + +### Initialization of Buffer and Encoding Setup + +First, the code initializes the buffer and sets up the encoding structure. + +```c +lxb_char_t buffer[1024]; +lxb_char_t *data = buffer; +const lxb_char_t *end = data + sizeof(buffer); + +// Unicode code points for encoding +lxb_codepoint_t cps[] = {0x041F, 0x0440, 0x0438, 0x0432, 0x0435, 0x0442, 0x002C, + 0x0020, 0x043C, 0x0438, 0x0440, 0x0021, 0}; + +encoding = lxb_encoding_data(LXB_ENCODING_UTF_8); + +status = lxb_encoding_encode_init_single(&encode, encoding); +if (status != LXB_STATUS_OK) { + FAILED("Failed to init encoder"); +} +``` +The buffer array serves as a container for the resulting UTF-8 byte string. The Unicode code points for "Привет, мир!" are specified in the `cps` array. The `lxb_encoding_data` function retrieves the encoding data for UTF-8, and `lxb_encoding_encode_init_single` initializes the `encode` structure for single character encoding. + +### Encoding Loop + +The next portion of the code encodes each Unicode code point and prints the results. + +```c +printf("Encode code points to UTF-8 byte string:\n"); + +for (size_t i = 0; cps[i] != 0; i++) { + pos = data; + + len = encoding->encode_single(&encode, &data, end, cps[i]); + if (len < LXB_ENCODING_ENCODE_OK) { + continue; + } + + printf("0x%04X: %.*s\n", cps[i], len, pos); +} +``` +Within the loop, `pos` stores the current position of `data`. The `encode_single` method encodes each code point into the buffer. `len` will be the number of bytes written, and each code point is printed in hexadecimal alongside its encoded UTF-8 bytes. + +### Finalizing the Encoded String + +Finally, the code terminates the string and prints the result.
+ +```c +*data = 0x00; + +printf("\nResult: %s\n", (char *) buffer); +``` +Adding a null terminator `0x00` to the buffer ensures it is a well-formed C string. The full UTF-8 encoded result is then printed. + +## Notes + +1. **Buffer Initialization**: The buffer's size ensures that it can contain the encoded string, preventing overflow. +2. **Encoder Initialization**: The `lxb_encoding_encode_init_single` function is essential for setting up the encoding process. +3. **Error Handling**: The code handles potential encoding errors, although they are not expected in this specific example. +4. **String Termination**: Proper string termination is necessary for safe string operations in C. + +## Summary + +This example showcases how to encode Unicode code points into a UTF-8 byte string using the `lexbor` library. It highlights buffer management, the encoding process, and error handling. This is a useful reference for developers needing to perform character encoding tasks with lexbor, demonstrating critical library functions and proper C programming practices. \ No newline at end of file diff --git a/source/examples/encoding/single/encode/encoder.md b/source/examples/encoding/single/encode/encoder.md index 4371536..dcac8d6 100644 --- a/source/examples/encoding/single/encode/encoder.md +++ b/source/examples/encoding/single/encode/encoder.md @@ -1,96 +1,117 @@ -# Encoding Input Data Example +# Encoding Input Strings to a Specified Encoding: Example -This article explains the purpose and functionality of the `encoder.c` source -file located in the `lexbor/encoding/single/encode` directory. The code provides -a utility for encoding text input based on a specified character encoding -scheme. It reads data from standard input (stdin), decodes any escaped code -points in the input, and encodes the results according to the selected encoding. 
+This example in `lexbor/encoding/single/encode/encoder.c` demonstrates how to use the `lexbor` library to encode input strings to a specified encoding. The source file `encoder.c` provides a comprehensive example of how to handle encoding using the `lexbor` encoding library. This involves initializing the encoder, reading from standard input, processing escaped code points, and outputting the result in the specified encoding. -## Key Components +## Key Code Sections -### Header and Macros +### 1. Getting the Encoding -The file begins with some header information including copyright and the -author's details. Following this, necessary includes and definitions are placed. -The macro `FAILED` is defined to handle error reporting and exit when a critical -failure occurs. This block of code succinctly prints an error message, displays -usage instructions if required, and terminates the program: +The first key step is to determine the encoding based on the command-line argument provided by the user. ```c -#define FAILED(with_usage, ...) \ - do { \ - fprintf(stderr, __VA_ARGS__); \ - fprintf(stderr, "\n"); \ - if (with_usage) { \ - usage(); \ - } \ - exit(EXIT_FAILURE); \ - } \ - while (0) +if (argc != 2) { + usage(); + exit(EXIT_SUCCESS); +} + +encoding = lxb_encoding_data_by_pre_name((const lxb_char_t *) argv[1], + strlen(argv[1])); +if (encoding == NULL) { + FAILED(true, "Failed to get encoding from name: %s\n", argv[1]); +} +``` + +This section reads the encoding name from the command line and retrieves the corresponding encoding data using `lxb_encoding_data_by_pre_name()`. If the encoding is not found, it prints an error message and exits. + +### 2. Initializing the Encoder + +Once the encoding is determined, we initialize the single code point encoder.
+ +```c +status = lxb_encoding_encode_init_single(&encode, encoding); +if (status != LXB_STATUS_OK) { + FAILED(false, "Failed to init encoder"); +} ``` -### Usage Function +This initializes an encoder for the specified encoding using `lxb_encoding_encode_init_single()`. If initialization fails, the program exits with an error message. -The `usage` function outputs the required command-line usage for the program, -listing all of the available encodings such as `UTF-8`, `ISO-8859-1`, and -`SHIFT-JIS`. This function is invoked if the user does not supply the required -arguments. +### 3. Processing Input Data + +The main loop reads from the standard input and processes each chunk of data. ```c -static void usage(void) { - printf("Usage: encoder \n\n"); - // List of available encodings... +do { + read_size = fread(inbuf, 1, sizeof(inbuf), stdin); + if (read_size != sizeof(inbuf)) { + if (feof(stdin)) { + loop = false; + } + else { + FAILED(false, "Failed to read stdin"); + } + } + + data = (const lxb_char_t *) inbuf; + end = data + read_size; + + while (data < end) { + data = escaped_to_codepoint(data, end, &cp, &state); + if (state != 0) { + if (loop || state != 3) { + break; + } + + state = 0; + } + + out = outbuf; + len = encoding->encode_single(&encode, &out, out_end, cp); + if (len < LXB_ENCODING_ENCODE_OK) { + if (len == LXB_ENCODING_ENCODE_SMALL_BUFFER) { + FAILED(false, "Failed to convert code point to bytes"); + } + + if (encoding->encoding == LXB_ENCODING_UTF_8) { + printf("%s", LXB_ENCODING_REPLACEMENT_BYTES); + continue; + } + + printf("?"); + continue; + } + + if (fwrite(outbuf, 1, len, stdout) != len) { + FAILED(false, "Failed to write data to stdout"); + } + } } +while (loop); ``` -### Escaped Code Point Conversion +This loop reads input data, processes it to convert code points to the target encoding, and then writes the result to the standard output. `escaped_to_codepoint()` is used to handle escape sequences in the input. + +### 4. 
Handling Escape Sequences -The function `escaped_to_codepoint` is responsible for converting escaped -Unicode sequences to their corresponding code points. The function processes the -input data character by character, identifying whether the sequence starts with -a backslash, and checking for either hexadecimal (`\x`) or Unicode (`\u`) -formats. If an incorrectly formatted escape sequence is detected, an error state -is triggered prompting the program to exit: +The function `escaped_to_codepoint()` processes escaped code points from the input data, converting them into `lxb_codepoint_t`. ```c -static const lxb_char_t *escaped_to_codepoint(const lxb_char_t *data, const lxb_char_t *end, +static const lxb_char_t * +escaped_to_codepoint(const lxb_char_t *data, const lxb_char_t *end, lxb_codepoint_t *cp, int8_t *state) { - // Processing logic... - if (*data != '\\') { - goto failed; // Handle invalid start of escape sequence - } - // More processing... + ... } ``` -### Main Functionality - -The `main` function orchestrates the entire encoding process: - -1. **Argument Handling**: It requires one argument indicating the desired - encoding. -2. **Encoding Setup**: It retrieves the encoding configuration using the - provided argument and initializes the encoder. -3. **Input Loop**: The program enters a loop where it reads input data from - stdin, processes it into code points, and then encodes these points: - - ```c - while (data < end) { - data = escaped_to_codepoint(data, end, &cp, &state); - // Encoding logic... - } - ``` - -4. **Output Handling**: The encoded output is written to stdout. If the encoding - is `UTF-8`, replacement bytes are used as necessary. - -Overall, the program is designed to robustly handle input encoding, managing -possible errors during reading and writing, and validating formats. The use of -the `lexbor` library enables effective encoding management, providing a variety -of supported character encodings. 
- -In conclusion, the `encoder.c` file serves as a practical example of encoding -conversion using a command-line utility, highlighting important coding -principles, such as error handling, input/output operations, and state -management within the context of encoding mechanisms. \ No newline at end of file +This function manages the state of escape processing, ensuring that sequences are correctly translated into code points. + +## Notes + +- The use of the `FAILED()` macro simplifies error handling by printing an error message and exiting if necessary. +- This example handles a variety of encodings and demonstrates the flexibility of the `lexbor` library in encoding text data. +- Careful state management throughout the processing ensures robustness, especially when handling partial or malformed escape sequences. + +## Summary + +This example emphasizes how to use the `lexbor` library to convert input strings into a specified encoding. It covers initialization, processing input in chunks, handling escape sequences, and ensuring the encoded output is correctly written. This illustration is vital for developers looking to integrate robust encoding capabilities in their applications using `lexbor`. \ No newline at end of file diff --git a/source/examples/encoding/single/encode/validate.md b/source/examples/encoding/single/encode/validate.md index 93cd26d..363040c 100644 --- a/source/examples/encoding/single/encode/validate.md +++ b/source/examples/encoding/single/encode/validate.md @@ -1,75 +1,112 @@ -# Encoding Unicode Code Points to UTF-8 Example +# Encode and Validate Unicode Code Points: Example -This example demonstrates how to validate and encode Unicode code points into a -UTF-8 byte string using the `lexbor` library. The functionality is encapsulated -within a C program located in the -[lexbor/encoding/single/encode/validate.c](https://github.com/lexbor/lexbor/blob/master/examples/lexbor/encoding/single/encode/validate.c) -file. 
The purpose of this code is to illustrate the encoding of a set of given -code points, handling exceptions for those that are invalid by replacing them -with a predefined replacement character. +This article explains the code from the `lexbor` library in the file `lexbor/encoding/single/encode/validate.c`. The example demonstrates how to encode a sequence of Unicode code points into a UTF-8 byte string and handle validation of those points, especially focusing on dealing with invalid Unicode code points. -## Overview of the Code +The example code shows how to use the `lexbor` library to encode an array of Unicode code points into UTF-8. It includes the crucial steps of initializing the encoder, iterating through the Unicode code points, encoding each point, handling errors, replacing invalid code points, and finally, outputting the encoded string. -The code begins by including the necessary header files from the `lexbor` library, -specifically targeting encoding functionality. It subsequently defines a macro -for error handling, which outputs an error message to `stderr` and exits the -program with a failure status. +## Key Code Sections -### Variable Declarations +### Buffer Preparation -The `main` function sets up various variables needed for the encoding process: +The code prepares the buffer that will hold the encoded UTF-8 byte string: -- `len`: This variable stores the length of the encoded string. -- `status`: Utilized for capturing the status of encoding operations. -- `encode`: An instance of `lxb_encoding_encode_t`, used to manage encoding - context. -- `encoding`: A pointer to the appropriate encoding data. -- `pos`: A pointer that tracks the current position in the output buffer. +```c +/* Prepare buffer */ +lxb_char_t buffer[1024]; +lxb_char_t *data = buffer; +const lxb_char_t *end = data + sizeof(buffer); +``` -### Buffer Preparation +Here, `buffer` is a fixed-size array where the encoded UTF-8 data will be stored. 
`data` is a pointer that will be adjusted as data is written into the buffer, and `end` marks the endpoint of the buffer to prevent overflow. + +### Defining Unicode Code Points + +A set of Unicode code points, including an invalid one, is defined for encoding: + +```c +/* Unicode code points for encoding */ +lxb_codepoint_t cps[] = {0x041F, 0x0440, 0x0438, 0x0432, 0x0435, 0x0442, + 0x002C, + 0x110000, /* <-- bad code point */ + 0x0020, 0x043C, 0x0438, 0x0440, 0x0021, 0}; +``` + +This array includes a mix of valid Unicode code points and an intentionally invalid code point (`0x110000`). The `0` at the end signifies the end of the array. + +### Initialize Encoder + +An encoder for the UTF-8 encoding is initialized: + +```c +encoding = lxb_encoding_data(LXB_ENCODING_UTF_8); + +status = lxb_encoding_encode_init_single(&encode, encoding); +if (status != LXB_STATUS_OK) { + FAILED("Failed to init encoder"); +} +``` + +The `lxb_encoding_data` function fetches the encoding data structure for UTF-8, and `lxb_encoding_encode_init_single` initializes the encoder context for encoding one code point at a time. An error check ensures that the encoder was initialized successfully. + +### Encoding and Validation + +The code iterates over the Unicode code points array to validate and encode each point: + +```c +for (size_t i = 0; cps[i] != 0; i++) { + pos = data; + + len = encoding->encode_single(&encode, &data, end, cps[i]); + + if (len < LXB_ENCODING_ENCODE_OK) { + if (len == LXB_ENCODING_ENCODE_SMALL_BUFFER) { + break; + } + + printf("Bad code point: 0x%04X; Replaced to: %s (0x%04X)\n", + cps[i], LXB_ENCODING_REPLACEMENT_BYTES, + LXB_ENCODING_REPLACEMENT_CODEPOINT); + + memcpy(data, LXB_ENCODING_REPLACEMENT_BYTES, + LXB_ENCODING_REPLACEMENT_SIZE); + + data += LXB_ENCODING_REPLACEMENT_SIZE; + + continue; + } -A buffer (`buffer`) of 1024 `lxb_char_t` elements is defined to hold the -resulting UTF-8 byte string. Pointers are initialized to manage the writing -process into this buffer safely.
+ printf("0x%04X: %.*s\n", cps[i], len, pos); +} +``` -### Unicode Code Points +For each code point: -An array of Unicode code points is declared, which includes both valid and an -intentionally invalid code point (`0x110000`). This is to illustrate how the -code handles bad input during encoding. +1. `pos` marks the initial position in the buffer. +2. `encoding->encode_single` attempts to encode the current code point. +3. If the return value `len` indicates an error: + - It checks if the buffer is too small (the code handles it theoretically, though it never occurs here due to enough buffer space). + - For invalid code points, it replaces them with a predefined replacement character (commonly `0xFFFD` in UTF-8). +4. If the encoding is successful, `len` specifies the number of bytes written. -### Encoding Initialization +### Final Output -The code retrieves the UTF-8 encoding data using -`lxb_encoding_data(LXB_ENCODING_UTF_8)` and initializes the encoding context -with `lxb_encoding_encode_init_single(&encode, encoding)`. If this -initialization fails, an error message is reported, and the program exits. +The result is terminated with a null character and printed: -### Encoding Loop +```c +/* Terminate string */ +*data = 0x00; -The core functionality is encapsulated in a loop that processes each code point -from the `cps` array: +printf("\nResult: %s\n", (char *) buffer); +``` -1. **Position Tracking**: The position pointer `pos` is reset to the current - data pointer at the start of the loop iteration. -2. **Encoding**: Each code point is encoded using the `encode_single` method. - The returned `len` represents the number of bytes written to the buffer. -3. **Error Handling**: If `len` indicates a problem (less than - `LXB_ENCODING_ENCODE_OK`), the code checks for buffer size issues (though - this example does not expect to encounter this). 
If the code point is - invalid, it prints an error message along with a replacement character - output, handling the invalid code point scenario gracefully. -4. **Output**: For valid code points, the program prints the code point and its - corresponding UTF-8 representation. +This step ensures the buffer is a valid C string and outputs the final encoded string. -### Finalization +## Notes -After processing all code points, the program terminates the string by setting -the last byte of the buffer to `0x00`. It then prints the final UTF-8 result. +- The example uses `lexbor`'s encoding library for UTF-8 encoding. +- Error handling is implemented to manage invalid Unicode code points. +- The buffer is large enough to handle the encoded output, avoiding buffer overflow concerns in this context. -## Conclusion +## Summary -The program effectively showcases how to handle Unicode encoding with proper -error management for invalid inputs. This example is particularly useful for -developers using the `lexbor` library to manage character encodings, providing -insight on validating and encoding procedures in C. \ No newline at end of file +This example demonstrates the usage of the `lexbor` library for encoding Unicode code points into UTF-8, handling errors gracefully, and replacing invalid code points. It highlights lexbor's flexibility and robustness in dealing with text encoding tasks, proving indispensable for applications needing precise control over encoding processes. By understanding this example, developers can leverage lexbor's capabilities for their encoding needs, ensuring correct handling and encoding of text data. 
\ No newline at end of file diff --git a/source/examples/encoding/single/from_to.md b/source/examples/encoding/single/from_to.md index c98b585..2a0fb72 100644 --- a/source/examples/encoding/single/from_to.md +++ b/source/examples/encoding/single/from_to.md @@ -1,132 +1,152 @@ -# Encoding Conversion Example +# Text Conversion Through Custom Encodings: Example -This article explains the encoding conversion functionality provided in the -source file -[lexbor/encoding/single/from_to.c](https://github.com/lexbor/lexbor/blob/master/examples/lexbor/encoding/single/from_to.c). -The code allows users to convert text from one character encoding to another via -command-line input. It demonstrates how to utilize the `lexbor` encoding library -for encoding and decoding different formats of character sets. +This article will provide an in-depth explanation of the `lexbor/encoding/single/from_to.c` +example file. The intent of this example is to demonstrate how the `lexbor` library can be +used to read input text in one character encoding, decode it, and then encode it to another +character encoding before writing it out. The article will break down the important sections +of the code, explain the functionality provided by the `lexbor` library, and present +key insights for potential users. -## Overview +The example code uses the `lexbor` library to +create a program that reads text input in one encoding, decodes it to a universal codepoint +representation, and re-encodes it to a different encoding before outputting it. This process +involves setting up encoding and decoding specifications, handling I/O efficiently, and +managing edge cases in encoding conversion. -The main function in this code receives two command-line arguments representing -the source (`from`) and target (`to`) encodings. 
It reads input data from -standard input, decodes it from the specified `from` encoding to Unicode code -points, and then encodes those code points into the specified `to` encoding -before writing the output to standard output. +## Key Code Sections -## Code Breakdown +### Command-Line Argument Processing -### Definitions and Includes - -At the beginning of the file, we include the necessary header for the `lexbor` -encoding module: - -```c -#include -``` - -This allows us access to various functions and types defined in the library, -which facilitate character encoding tasks. - -### Failure Handling Macro - -The `FAILED` macro is defined for error handling throughout the code: - -```c -#define FAILED(with_usage, ...) ... -``` - -This macro simplifies error reporting by printing error messages to standard -error and conditionally calling the `usage` function to display usage -instructions before terminating the program. Adopting this macro ensures a -consistent approach to error handling across the code. - -### Usage Function - -The `usage` function provides instructions on how to use the encoding conversion -tool: +The program begins by checking if the correct number of command-line arguments are provided, +which represent the 'from' and 'to' encodings. ```c -static void usage(void) { ... } +if (argc != 3) { + usage(); + exit(EXIT_SUCCESS); +} + +/* Get encoding data for 'from' */ +from = lxb_encoding_data_by_pre_name((const lxb_char_t *) argv[1], strlen(argv[1])); +if (from == NULL) { + FAILED(true, "Failed to get encoding from name: %s", argv[1]); +} + +/* Get encoding data for 'to' */ +to = lxb_encoding_data_by_pre_name((const lxb_char_t *) argv[2], strlen(argv[2])); +if (to == NULL) { + FAILED(true, "Failed to get encoding from name: %s", argv[2]); +} ``` -It lists the accepted input encodings that users can specify when executing the -program. This function is crucial for user guidance, ensuring that they know the -correct format for command inputs. 
+Here, the `lxb_encoding_data_by_pre_name` function retrieves the encoding data based on the +provided name. If the encoding data cannot be found, the program exits with an error. -### Main Function +### Initializing Encoders and Decoders -The `main` function orchestrates the overall process: +The code initializes the encoding and decoding structures provided by the `lexbor` library. ```c -int main(int argc, const char *argv[]) { ... } +status = lxb_encoding_encode_init_single(&encode, to); +if (status != LXB_STATUS_OK) { + FAILED(true, "Failed to init encoder"); +} + +status = lxb_encoding_decode_init_single(&decode, from); +if (status != LXB_STATUS_OK) { + FAILED(true, "Failed to init decoder"); +} ``` -1. **Argument Count Check**: The function starts by checking if the user - provided exactly two arguments (the source and target encodings). If not, the - `usage` function is called, and the program exits. - -2. **Encoding Data Retrieval**: The code fetches the encoding information for - both the source and target encodings using the - `lxb_encoding_data_by_pre_name` function: - - ```c - from = lxb_encoding_data_by_pre_name(...); - to = lxb_encoding_data_by_pre_name(...); - ``` - - If either retrieval fails, the `FAILED` macro is triggered, stopping - execution. - -3. **Initialization of Encoder and Decoder**: The encoder and decoder are - initialized with the retrieved encoding data: - - ```c - status = lxb_encoding_encode_init_single(&encode, to); - status = lxb_encoding_decode_init_single(&decode, from); - ``` +The `lxb_encoding_encode_init_single` and `lxb_encoding_decode_init_single` functions prepare +the encoder and decoder for the specified encodings. Checking their status ensures proper +resource initialization before processing input. - These initializations set up the necessary state for encoding and decoding - operations.
+### Reading and Processing Input Data -### Input Reading and Processing Loop - -The program enters a loop where it continuously reads from standard input until -EOF (End Of File) is reached: +The core logic of reading input data, decoding it, transforming it to a codepoint and re-encoding +is encapsulated in a loop that handles data in chunks. ```c do { + /* Read standard input */ read_size = fread(inbuf, 1, sizeof(inbuf), stdin); - ... -} while (loop); + if (read_size != sizeof(inbuf)) { + if (feof(stdin)) { + loop = false; + } + else { + FAILED(false, "Failed to read stdin"); + } + } + + /* Decode incoming data */ + data = (const lxb_char_t *) inbuf; + end = data + read_size; + + while (data < end) { + /* Decode */ + cp = from->decode_single(&decode, &data, end); + if (cp > LXB_ENCODING_DECODE_MAX_CODEPOINT) { + if (cp == LXB_ENCODING_DECODE_CONTINUE && loop) { + break; + } + cp = LXB_ENCODING_REPLACEMENT_CODEPOINT; + } + + /* Encode */ + out = outbuf; + len = to->encode_single(&encode, &out, out_end, cp); + if (len < LXB_ENCODING_ENCODE_OK) { + printf("?"); + continue; + } + + if (fwrite(outbuf, 1, len, stdout) != len) { + FAILED(false, "Failed to write data to stdout"); + } + } +} +while (loop); ``` -Within the loop: - -- The fetched data is decoded using the `from` encoder to obtain Unicode code - points. - -- For each code point decoded, it is then encoded with the `to` encoder and - written to standard output. +The input is read in chunks of 4096 bytes, decoded character by character to codepoints, and +then re-encoded using the target encoding. Any decoding errors result in a replacement codepoint +being used, while encoding errors default to printing a question mark (`?`). -### Finalization +### Finalizing Encoding and Decoding -After processing all input data, the code finalizes the decoder and encoder: +Finally, the program ensures that any remaining buffer data is handled by finalizing the +decoding and encoding processes. 
```c status = lxb_encoding_decode_finish_single(&decode); +if (status != LXB_STATUS_OK) { + printf("?"); +} + +out = outbuf; len = lxb_encoding_encode_finish_single(&encode, &out, out_end); +if (len != 0) { + if (fwrite(outbuf, 1, len, stdout) != len) { + FAILED(false, "Failed to write data to stdout"); + } +} ``` -These finalization steps ensure that any remaining data is processed and that -resources are cleaned up properly before the program exits. +These steps ensure that any buffered data is properly flushed out before program termination. + +## Notes + +1. The program supports a wide range of encodings, making it a versatile tool for encoding conversion. +2. Error handling and edge cases are managed to ensure the program does not crash on unexpected input. +3. The `lexbor` library provides comprehensive functions for encoding and decoding, making such + conversions straightforward. -## Conclusion +## Summary -The `from_to.c` example illustrates a practical approach to character encoding -conversion using the `lexbor` encoding library. It showcases error handling, user -guidance, and processing loops, making it a valuable reference for developers -needing to handle various text encodings in their applications. This example -emphasizes the importance of robust input handling and clean output generation -within character encoding operations. \ No newline at end of file +This example highlights how the `lexbor` library can be used to build a robust encoding conversion +tool. The key takeaways include understanding how to initialize encoding and decoding structures, +process input data efficiently, handle error cases gracefully, and ensure that conversions +are completed correctly before program exit. Such an understanding can facilitate building +more sophisticated text processing tools using the `lexbor` library. 
\ No newline at end of file diff --git a/source/examples/html/document_parse.md b/source/examples/html/document_parse.md index 398107f..8923dbd 100644 --- a/source/examples/html/document_parse.md +++ b/source/examples/html/document_parse.md @@ -1,49 +1,14 @@ -# HTML Document Parsing Example +# Parsing an HTML Document: Example -This article explains an example of parsing an HTML document using the `lexbor` -library. The purpose of this example, located in the source file -[lexbor/html/document_parse.c](https://github.com/lexbor/lexbor/blob/master/examples/lexbor/html/document_parse.c), -is to illustrate the steps necessary to create an HTML document, parse a string -of HTML, and serialize the resulting DOM tree. +In this example, located in the `lexbor/html/document_parse.c` file, we see a typical usage scenario of the `lexbor` library for parsing an HTML document. This example demonstrates the creation of an HTML document object, basic parsing of HTML content, and serialization of the resulting DOM tree. -## Example Overview +The example provides a clear, concise illustration of how to initialize and use the `lexbor` library to parse an HTML document. The example highlights crucial library functions and demonstrates error handling during document creation and HTML parsing. We will analyze several important sections of the code to understand its workings. -The example demonstrates the following key steps: +## Key Code Sections -1. **Creating the HTML Document**: Initializing a new HTML document. -2. **Parsing the HTML**: Taking an HTML string and processing it to generate a - DOM tree. -3. **Outputting the Results**: Printing the original HTML and the resulting DOM - structure. -4. **Cleaning Up**: Destroying the document to free allocated resources. +### Creating an HTML Document -## Code Explanation - -### Main Function - -The program starts in the `main` function, where it declares a variable for the -document status and a pointer to the HTML document. 
- -```c -lxb_status_t status; -lxb_html_document_t *document; -``` - -### Defining HTML Content - -A static character array contains the HTML to be parsed. The length of this HTML -string is also calculated. - -```c -static const lxb_char_t html[] = "

<div><p>blah-blah-blah</p></div>

"; -size_t html_len = sizeof(html) - 1; -``` - -### Document Initialization - -The next segment involves initializing a new HTML document using the -`lxb_html_document_create` function. This function allocates necessary memory -and sets up internal structures to hold the document data. +First, the code initializes the `lexbor` HTML document object. This is important because the document object forms the anchor point for subsequent parsing and manipulation operations. ```c document = lxb_html_document_create(); @@ -52,14 +17,11 @@ if (document == NULL) { } ``` -If the document creation fails, an error message is printed, allowing for -debugging. +Here, the `lxb_html_document_create()` function is called to allocate and initialize a new HTML document object. If the allocation fails, the program prints an error message and terminates. -### HTML Parsing +### Parsing the HTML -Once the document is created, the program parses the HTML content. The -`lxb_html_document_parse` function is responsible for parsing the input HTML -string. +Next, the example proceeds to parse a static HTML string. ```c status = lxb_html_document_parse(document, html, html_len); @@ -68,48 +30,39 @@ if (status != LXB_STATUS_OK) { } ``` -If the status indicates a failure, an appropriate message is shown. This -rigorous checking ensures that errors during parsing do not go unnoticed. +The `lxb_html_document_parse()` function is used to parse the HTML content. The function takes the document object, a pointer to the HTML data, and the length of this data. If parsing fails (indicated by a status other than `LXB_STATUS_OK`), an error message is printed, and the program halts. -### Output the Results +### Outputting the Parsed Content -After successfully parsing the HTML, the program prints the original HTML string -and serializes the resulting DOM tree. The `PRINT` macro is used for outputting -the HTML content. 
+To aid understanding, the code prints both the original HTML content and the resulting parsed DOM tree. ```c PRINT("HTML:"); PRINT("%s", (const char *) html); ``` -It then calls a serialization function to visualize the structure of the parsed -HTML document: - ```c PRINT("\nHTML Tree:"); serialize(lxb_dom_interface_node(document)); ``` -This step helps developers understand how the HTML input is translated into a -DOM tree structure, which is crucial for many web development tasks. +The `PRINT` macro is used to output the HTML content and the resulting DOM tree. The `serialize` function (not fully shown in the excerpt) is responsible for serializing the DOM tree to a human-readable format, providing insight into the structure of the parsed document. -### Document Cleanup +### Cleaning Up -Finally, the program cleans up by destroying the HTML document to avoid memory -leaks. This is done using the `lxb_html_document_destroy` function: +Finally, the example demonstrates proper resource management by destroying the created HTML document. ```c lxb_html_document_destroy(document); ``` -Ensuring proper resource management is important in C programming, as it helps -maintain system performance and stability. +This call to `lxb_html_document_destroy()` ensures that all resources allocated to the document object are released, preventing memory leaks. + +## Notes + +- **Error Handling**: The example employs a clear error handling strategy, checking the success of crucial library calls and halting execution when failures occur. +- **Serialization**: The use of a custom `serialize` function (assumed to be defined elsewhere in the code) helps visualize the resulting DOM tree, which is beneficial for both debugging and learning purposes. 
-## Conclusion +## Summary -The example provided in -[lexbor/html/document_parse.c](https://github.com/lexbor/lexbor/blob/master/examples/lexbor/html/document_parse.c) -serves as a clear demonstration of how to create, parse, and handle an HTML -document using `lexbor`. Through careful initialization, parsing, result -outputting, and cleanup, this code illustrates best practices for managing HTML -documents in a C environment. \ No newline at end of file +This example code from `lexbor/html/document_parse.c` serves as an excellent starting point for understanding basic document parsing using the `lexbor` library. It covers essential aspects such as initialization, parsing, and cleanup, while also demonstrating how to handle errors effectively. Typical `lexbor` users can draw valuable insights from this example to incorporate into their own projects, particularly concerning proper resource management and direct interaction with the HTML DOM. \ No newline at end of file diff --git a/source/examples/html/document_parse_chunk.md b/source/examples/html/document_parse_chunk.md index 7dc7f92..84ae161 100644 --- a/source/examples/html/document_parse_chunk.md +++ b/source/examples/html/document_parse_chunk.md @@ -1,32 +1,12 @@ -# HTML Document Parsing Example +# Parsing HTML in Chunks with lexbor: Example -This article provides an overview of an example implementation of HTML document -parsing using the `lexbor` library. The example is located in the source file -[lexbor/html/document_parse_chunk.c](https://github.com/lexbor/lexbor/blob/master/examples/lexbor/html/document_parse_chunk.c). -This example demonstrates how to create an HTML document, parse it in chunks, -and handle the cleaning up of allocated resources. +This article provides a detailed examination of `lexbor/html/document_parse_chunk.c`, a C code example demonstrating how to parse HTML content in chunks using the `lexbor` library. 
Parsing HTML in chunks can be particularly useful when dealing with streaming data, allowing for efficient and incremental data processing. -## Code Overview +## Key Code Sections -The primary function of the code is to illustrate how to process HTML content in -segments, allowing for a more flexible parsing technique suitable for scenarios -where full documents may not be available in one piece. This chunk-based parsing -can be particularly useful for streaming applications or when handling very -large HTML documents. +### Initialization of the HTML Document -### Initialization - -At the beginning of the `main` function, several essential variables are -declared, including a status variable of type `lxb_status_t` and a pointer to a -`lxb_html_document_t`, which will represent our HTML document. - -```c -lxb_html_document_t *document; -``` - -The `lxb_html_document_create()` function is called to create an instance of the -HTML document. It is essential to check whether the document was created -successfully. +The first critical section of this example is the initialization of the HTML document object: ```c document = lxb_html_document_create(); @@ -35,41 +15,30 @@ if (document == NULL) { } ``` -If the document creation fails, the program will exit, indicating an error. +Here, the `lxb_html_document_create()` function is called to create an HTML document. The function returns a pointer to the newly created `lxb_html_document_t` structure. If the creation fails, it returns `NULL`, prompting an error message. -### Parsing HTML Chunks +### Beginning the Chunk Parsing Process -The HTML content is stored in a two-dimensional array of characters. Each string -represents a fragment of the HTML document. The fragments are designed to be -combined later to form a complete HTML structure. +After the document is initialized, the parsing process begins with the following lines: ```c -static const lxb_char_t html[][64] = { - "", "", "HTML chun", - "ks parsing", "
", "good for me", "
", "\0" -}; +status = lxb_html_document_parse_chunk_begin(document); +if (status != LXB_STATUS_OK) { + FAILED("Failed to parse HTML"); +} ``` -After setting up the document, the code initiates the parsing process by calling -`lxb_html_document_parse_chunk_begin()`, which prepares the document to accept -incoming chunks of HTML. +The function `lxb_html_document_parse_chunk_begin()` prepares the document object for incremental parsing. It initializes the necessary internal structures and state, ensuring that the document is ready to accept chunks of HTML data. Handling the `LXB_STATUS_OK` status ensures the operation is successful. -```c -status = lxb_html_document_parse_chunk_begin(document); -``` +### Feeding HTML Chunks to the Parser -The program then enters a loop that iterates over each HTML chunk until it -reaches a null-terminating character. For each chunk, it prints the chunk -content and attempts to parse it using `lxb_html_document_parse_chunk()`. This -function takes the current HTML chunk and its length as input, returning a -status that indicates success or failure. +The code then iterates through an array of HTML chunks, feeding each one to the parser: ```c for (size_t i = 0; html[i][0] != '\0'; i++) { PRINT("%s", (const char *) html[i]); - - status = lxb_html_document_parse_chunk(document, html[i], + + status = lxb_html_document_parse_chunk(document, html[i], strlen((const char *) html[i])); if (status != LXB_STATUS_OK) { FAILED("Failed to parse HTML chunk"); @@ -77,14 +46,11 @@ for (size_t i = 0; html[i][0] != '\0'; i++) { } ``` -If any chunk fails to parse correctly, the program will exit with an error -message. +In this loop, each element of the `html` array represents a chunk of the HTML document. The `lxb_html_document_parse_chunk()` function is called with three arguments: the document, the current chunk, and the chunk's length. This function parses each chunk and updates the document's state accordingly. 
The code also prints each chunk before parsing it, providing a trace of the incoming data. -### Finalization +### Completing the Chunk Parsing Process -After processing all HTML chunks, the end of the parsing process is signaled -with the call to `lxb_html_document_parse_chunk_end()`. This function finalizes -the parsing operation and validates the final structure of the document. +Once all chunks are processed, the code completes the parsing process: ```c status = lxb_html_document_parse_chunk_end(document); @@ -93,31 +59,37 @@ if (status != LXB_STATUS_OK) { } ``` -### Printing Results +The `lxb_html_document_parse_chunk_end()` function finalizes the incremental parsing process. It ensures that any remaining parsing tasks are completed and the document structure is properly built. -Once parsing is complete, the example demonstrates how to serialize the -resulting HTML DOM tree using the `serialize()` function, allowing the user to -see the structured representation of the parsed HTML content. +### Serialization of the HTML Document Tree + +The next section serializes and prints the parsed HTML document tree: ```c PRINT("\nHTML Tree:"); serialize(lxb_dom_interface_node(document)); ``` -### Cleanup +The `serialize()` function, though not defined in this snippet, presumably converts the internal document tree into a human-readable format and prints it. The `lxb_dom_interface_node()` function provides an interface to the document's root node, which `serialize()` then processes. + +### Destruction of the HTML Document -Finally, the document is destroyed using `lxb_html_document_destroy()`, which -frees the allocated memory associated with the HTML document instance. This -resource management step is crucial in avoiding memory leaks. 
+Finally, the document object is destroyed to free allocated resources: ```c lxb_html_document_destroy(document); ``` -## Conclusion +This function ensures that all memory and resources associated with the document object are appropriately released, preventing memory leaks. + +## Notes + +- **Chunk Parsing**: This example shows a common approach for handling streaming data by breaking it into manageable chunks. +- **Error Handling**: The code checks the status after every parsing function call, ensuring robust error detection and messaging. +- **Resource Management**: Proper creation and destruction of objects ensure efficient use of memory resources. + +## Summary + +This example demonstrates the intermediate-to-advanced use of the `lexbor` library for parsing HTML content incrementally. By initializing a document, processing it in chunks, finalizing the parse, and printing the result, users can handle large or streaming HTML data efficiently. This pattern is crucial for applications that need to process data as it arrives, such as web crawlers or real-time data analytics systems. -This example effectively illustrates how to use `lexbor` for HTML document parsing -in a chunked manner. The structure and logic of the code provide a solid -foundation for more advanced HTML processing applications. It encapsulates -essential operations such as initialization, incremental parsing, result -extraction, and cleanup in a clear and easy-to-follow manner. \ No newline at end of file +Understanding this example provides a solid foundation for leveraging `lexbor` in complex, data-intensive applications. 
\ No newline at end of file diff --git a/source/examples/html/document_title.md b/source/examples/html/document_title.md index 0c72fbf..d807bc0 100644 --- a/source/examples/html/document_title.md +++ b/source/examples/html/document_title.md @@ -1,31 +1,18 @@ -# HTML Document Title Example +# Manipulating HTML Document Title: Example -This article will explain the functionality of the HTML document title example -implemented in the source code found in -[lexbor/html/document_title.c](https://github.com/lexbor/lexbor/blob/master/examples/lexbor/html/document_title.c). -The purpose of this code is to demonstrate how to parse an HTML string, retrieve -its title, modify the title, and then display the resulting HTML document -structure using the `lexbor` library. +This article provides an in-depth explanation of the example code in +`lexbor/html/document_title.c`, which demonstrates how to work with HTML +document titles using the `lexbor` library. The code illustrates initializing a +document, parsing an HTML string, extracting and modifying the title, and +printing the tree structure before and after the change. -## Code Breakdown +## Key Code Sections -### Initialization +### Initializing the HTML Document -The code begins with the inclusion of the required headers and the setup of the -`main` function, which is the entry point of the program. Here, the main task -involves creating an HTML document instance and specifying the necessary -variables. - -```c -lxb_html_document_t *document; -``` -This line declares a pointer to an `lxb_html_document_t` structure which -represents the HTML document being created. The succeeding lines define -variables for storing the title and its length. - -### Creating the Document - -The next significant step is the initialization of the HTML document: +The first critical step in the example is the creation of an HTML document +object. This object will represent the entire HTML structure that the lexbor +library manages. 
```c document = lxb_html_document_create(); @@ -33,14 +20,14 @@ if (document == NULL) { FAILED("Failed to create HTML Document"); } ``` -In this snippet, the `lxb_html_document_create` function is called to allocate -memory for a new HTML document. If the document fails to create, the program -invokes the `FAILED` macro to signal an error. -### Parsing HTML +Here, the function `lxb_html_document_create` is used to allocate and +initialize a new `lxb_html_document_t` structure. If the initialization fails, +the program will print an error message and terminate. + +### Parsing the HTML String -After successfully creating the document, the code proceeds to parse the HTML -string: +Once the document is created, the example code parses a provided HTML string. ```c status = lxb_html_document_parse(document, html, html_len); @@ -48,36 +35,46 @@ if (status != LXB_STATUS_OK) { FAILED("Failed to parse HTML"); } ``` -Here, the HTML content defined in the `html` array—specifically the title tag -which contains extra spaces—is parsed. The variable `status` checks if the -operation was successful. If not, the program exits with an error message. -### Retrieving the Title +The `lxb_html_document_parse` function takes the document object and the HTML +string along with its length to populate the document with the appropriate +nodes and structure. Proper error handling is shown to ensure that parsing +completes successfully. -Once the document is parsed, the code retrieves the title of the document: +### Retrieving the Document Title + +The example demonstrates two methods for retrieving the document title: +formatted and raw. ```c title = lxb_html_document_title(document, &title_len); -``` -This function call extracts the title text from the document, storing it into -the `title` variable. The length of the title is also provided through the -`title_len` reference. The subsequent `if` statement checks whether the title -exists, printing the title or an empty message accordingly. 
- -### Obtaining the Raw Title +if (title == NULL) { + PRINT("\nTitle is empty"); +} +else { + PRINT("\nTitle: %s", title); +} -The following code retrieves the raw title, which includes the original -formatting (e.g., extra spaces): +... -```c title = lxb_html_document_title_raw(document, &title_len); +if (title == NULL) { + PRINT("Raw title is empty"); +} +else { + PRINT("Raw title: %s", title); +} ``` -Much like the previous title retrieval, this extracts the unformatted title, -allowing a comparison between the cleaned and raw titles. -### Modifying the Title +The `lxb_html_document_title` function retrieves the title after trimming +whitespace and normalizing spaces. Conversely, `lxb_html_document_title_raw` +returns the title exactly as it appears in the document, preserving all +original formatting and whitespace. + +### Modifying the Document Title -The code then demonstrates how to change the document's title: +Next, the example code changes the document title to a new value provided by +`new_title`. ```c status = lxb_html_document_title_set(document, new_title, new_title_len); @@ -85,37 +82,58 @@ if (status != LXB_STATUS_OK) { FAILED("Failed to change HTML title"); } ``` -By invoking `lxb_html_document_title_set`, the title is altered to a new value -defined by the `new_title` variable. An error check follows to ensure the title -change was successful. -### Displaying the New Title and HTML Structure +Here, the `lxb_html_document_title_set` function is called with the new title +and its length. This function updates the document's title element, and error +handling ensures the operation completes successfully. -The final steps involve displaying the updated title and the entire HTML -document structure after modification: +### Serializing and Printing the HTML Tree + +After modifying the title, the example prints the document's tree structure +before and after the title change. 
```c -title = lxb_html_document_title(document, &title_len); +PRINT("HTML Tree: "); +serialize(lxb_dom_interface_node(document)); + +... + +PRINT("\nHTML Tree after change title: "); +serialize(lxb_dom_interface_node(document)); ``` -This repeats the earlier title retrieval process to print the new title. -Finally, the code prints the altered HTML structure to show the impact of the -title change. -### Cleanup +The `serialize` function is used to output the tree structure, showing all +nodes and their relationships. This helps visualize the changes made to the +document. + +### Cleaning Up -Lastly, the document is destroyed to free the allocated memory, which is crucial -for preventing memory leaks: +Finally, the code cleans up by destroying the document object, freeing any +resources allocated during its creation and manipulation. ```c lxb_html_document_destroy(document); ``` -## Conclusion +This is crucial to prevent memory leaks and ensure proper program termination. + +## Notes + +- **Error Handling**: Robust error handling ensures that each operation + (creation, parsing, modification) completes successfully or produces useful + output if it fails. +- **Title Retrieval vs. Raw Title**: The distinction between normalizing + whitespaces in the title versus retrieving it as-is can be important for + different application needs. +- **Resource Management**: Proper allocation and deallocation of resources are + demonstrated to maintain program stability and efficiency. + +## Summary -This example illustrates the basic operations for handling HTML document titles -using the `lexbor` library, including parsing content, accessing and modifying the -title, and ensuring proper resource management. The structure of the code is -straightforward, aiming to provide a clear understanding of each step involved -in managing an HTML document's title. 
As developers familiarize themselves with -the functionalities offered by `lexbor`, they will be better equipped to -manipulate HTML content programmatically. \ No newline at end of file +In this example, we've explored the use of the `lexbor` library to manipulate an +HTML document's title. The code demonstrates document creation, HTML parsing, +title extraction, title modification, and tree serialization. Key takeaways +include understanding lexbor's various functions for title handling and the +importance of resource management and error handling. This example is a helpful +reference for developers looking to programmatically control HTML content using +lexbor. \ No newline at end of file diff --git a/source/examples/html/element_attributes.md b/source/examples/html/element_attributes.md index b02d503..e989e55 100644 --- a/source/examples/html/element_attributes.md +++ b/source/examples/html/element_attributes.md @@ -1,145 +1,144 @@ -# Element Attributes Example +# Handling Element Attributes with `lexbor`: Example -This article explains the implementation found in -[lexbor/html/element_attributes.c](https://github.com/lexbor/lexbor/blob/master/examples/lexbor/html/element_attributes.c), -which demonstrates how to manipulate HTML element attributes using the `lexbor` -library. The example outlines parsing an HTML snippet, finding an element, and -performing various operations involving element attributes, such as adding, -checking existence, retrieving, modifying, and removing attributes from an -element. +This article explores the `lexbor/html/element_attributes.c` example, which demonstrates parsing an HTML document, manipulating DOM elements, and their attributes using the `lexbor` library. The example focuses on setting, getting, checking for existence, iterating over, changing, and finally removing attributes of a DOM element within a parsed HTML document. 
-## Code Overview +## Key Code Sections -The code begins by including necessary headers and defining the main function, -which initializes variables for handling the document and its components. The -use of `lxb_status_t` for tracking the status of operations is essential -throughout the code. - -### HTML Parsing - -The code defines a static HTML string: +### Parsing the HTML Document ```c static const lxb_char_t html[] = "
"; -``` +size_t html_len = sizeof(html) - 1; -A document is parsed from this HTML string with: - -```c +/* Parse */ document = parse(html, html_len); ``` -After parsing, the code outputs the structure of the DOM tree to the console -using a `serialize` function, allowing developers to visualize the parsed HTML -elements. - -### Collection Creation +The HTML document defined as a static string is parsed using the `parse` function that constructs an `lxb_html_document_t` object. This is the initial step, setting up the environment for further DOM manipulations. -Next, a DOM collection is created to hold references to found elements: +### Creating and Using a Collection ```c +/* Create Collection for elements */ collection = lxb_dom_collection_make(&document->dom_document, 16); +if (collection == NULL) { + FAILED("Failed to create collection"); +} ``` -If the collection creation fails, an error message is printed, and the program -exits. +A `lxb_dom_collection_t` is created to store elements found during searching. This is essential for working with multiple elements efficiently. The collection is initialized with a pre-defined capacity of 16 elements. -### Searching for Elements - -To find the `
` element in the DOM, the code first obtains the body element -and then calls: +### Finding and Accessing Elements ```c -status = lxb_dom_elements_by_tag_name(element, collection, (const lxb_char_t *) "div", 3); -``` +/* Get BODY element (root for search) */ +body = lxb_html_document_body_element(document); +element = lxb_dom_interface_element(body); + +/* Find DIV element */ +status = lxb_dom_elements_by_tag_name(element, collection, + (const lxb_char_t *) "div", 3); -This line searches for all `
<div>` elements under the specified parent element. -A check for successful status and the collection's length follows, ensuring that -at least one `<div>
` is found. +if (status != LXB_STATUS_OK || lxb_dom_collection_length(collection) == 0) { + FAILED("Failed to find DIV element"); +} +``` -### Adding an Attribute +Here, the `body` element serves as the root for the search. The `lxb_dom_elements_by_tag_name` function searches for all `div` tags and stores them in the collection. Error checks ensure that the `div` elements are found successfully. -Once the element is identified, a new attribute is added using: +### Setting and Appending Attributes ```c -attr = lxb_dom_element_set_attribute(element, name, name_size, (const lxb_char_t *) "oh God", 6); +attr = lxb_dom_element_set_attribute(element, name, name_size, + (const lxb_char_t *) "oh God", 6); +if (attr == NULL) { + FAILED("Failed to create and append new attribute"); +} ``` -In this case, the attribute named "my-name" is appended with a value of "oh -God." If the attribute creation fails, an error message is displayed. +A new attribute is appended to the `div` element using `lxb_dom_element_set_attribute`. The attribute name is "my-name" and its value is "oh God". The function creates the attribute if it doesn't already exist and appends it to the element. ### Checking Attribute Existence -The program checks if the newly added attribute exists: - ```c is_exist = lxb_dom_element_has_attribute(element, name, name_size); -``` -A printed message confirms whether the attribute is present or not based on the -check. +if (is_exist) { + PRINT("\nElement has attribute \"%s\": true", (const char *) name); +} +else { + PRINT("\nElement has attribute \"%s\": false", (const char *) name); +} +``` -### Retrieving Attribute Values +The `lxb_dom_element_has_attribute` checks whether the given attribute exists on the element. The result is printed accordingly. 
-The next operation retrieves the value of the specified attribute: +### Retrieving Attribute Value ```c value = lxb_dom_element_get_attribute(element, name, name_size, &value_len); -``` +if (value == NULL) { + FAILED("Failed to get attribute value by qualified name"); +} -If successful, it prints the value associated with the "my-name" attribute. +PRINT("Get attribute value by qualified name \"%s\": %.*s", + (const char *) name, (int) value_len, value); +``` -### Iterating Through Attributes +`lxb_dom_element_get_attribute` retrieves the value of the specified attribute. If the attribute is found, its value and length are returned and printed. This section shows how to access the values of element attributes. -The code then demonstrates how to iterate through all attributes of the element: +### Iterating Over Attributes ```c +/* Iterator */ +PRINT("\nGet element attributes by iterator:"); attr = lxb_dom_element_first_attribute(element); -``` - -This iterates through attributes using a `while` loop, printing each attribute's -name and value until there are no more attributes in the collection. -### Modifying an Attribute Value +while (attr != NULL) { + tmp = lxb_dom_attr_qualified_name(attr, &tmp_len); + printf("Name: %s", tmp); -To change the value of an existing attribute, the code retrieves the attribute -by name: + tmp = lxb_dom_attr_value(attr, &tmp_len); + if (tmp != NULL) { + printf("; Value: %s\n", tmp); + } + else { + printf("\n"); + } -```c -attr = lxb_dom_element_attr_by_name(element, name, name_size); + attr = lxb_dom_element_next_attribute(attr); +} ``` -Then, it updates the value to "new value" using: +Using an iterator, this section retrieves and prints all attributes of the element. `lxb_dom_element_first_attribute` gets the first attribute, and `lxb_dom_element_next_attribute` progresses through the list. 
+ +### Changing Attribute Value ```c +attr = lxb_dom_element_attr_by_name(element, name, name_size); status = lxb_dom_attr_set_value(attr, (const lxb_char_t *) "new value", 9); +if (status != LXB_STATUS_OK) { + FAILED("Failed to change attribute value"); +} ``` -### Removing an Attribute +Changing an attribute's value involves first retrieving the attribute using `lxb_dom_element_attr_by_name` and then setting the value with `lxb_dom_attr_set_value`. Error checking ensures that the operation is successful. -Finally, the example concludes with the removal of the newly added attribute: +### Removing Attributes ```c +/* Remove new attribute by name */ lxb_dom_element_remove_attribute(element, name, name_size); ``` -This operation is followed by a serialized output of the DOM tree again, -allowing the developer to observe changes. +The final operation removes the specified attribute from the element using `lxb_dom_element_remove_attribute`. This demonstrates the library's capabilities for cleaning up or updating the DOM. -### Cleanup +## Notes -The code ensures proper resource management by destroying the collection and the -document at the end of the main function to prevent memory leaks: - -```c -lxb_dom_collection_destroy(collection, true); -lxb_html_document_destroy(document); -``` +- Proper error handling is crucial when manipulating the DOM to ensure robust and predictable behavior. +- Iterating over attributes can provide useful insights into the current state of an element's attributes, useful for debugging or further manipulation. +- Changing and removing attributes dynamically allows for flexible DOM updates. -## Conclusion +## Summary -The `element_attributes.c` example illustrates fundamental operations in DOM -manipulation provided by the `lexbor` library. 
The code efficiently demonstrates -how to parse HTML, locate and manipulate elements, manage attributes, and ensure -appropriate cleanup of resources, making it a valuable reference for web -developers working with the `lexbor` framework. \ No newline at end of file +This example demonstrates how to create, manipulate, and manage element attributes using the `lexbor` library, covering parsing HTML, finding elements, setting, retrieving, iterating over, changing, and removing attributes. These operations form the basis for extensive DOM manipulations in web development and highlight the power and flexibility of `lexbor` for such tasks. Understanding these fundamentals is essential for effectively utilizing `lexbor` in complex web applications. \ No newline at end of file diff --git a/source/examples/html/element_create.md b/source/examples/html/element_create.md index cc641f8..de37564 100644 --- a/source/examples/html/element_create.md +++ b/source/examples/html/element_create.md @@ -1,56 +1,119 @@ -# HTML Element Creation Example - -This article explains the implementation of creating and appending HTML elements -in a document using the respective `lexbor` library. The example provided is from -the source file -[lexbor/html/element_create.c](https://github.com/lexbor/lexbor/blob/master/examples/lexbor/html/element_create.c). - -## Introduction - -The code demonstrates how to initialize an HTML document, create various HTML -elements using their tag IDs, and manage them within a document structure. The -main functionalities utilized include parsing an empty HTML document, creating -elements, and preserving the overall tree structure through serialization. - -## Code Overview - -1. **Initialization**: The code begins with the necessary includes and the - definition of the `main` function. It declares necessary pointers to hold the - document, body element, and tags. - -2. 
**Parse Document**: The function `parse` is called with an empty string, - initializing an HTML document. This is essential for setting up a base where - elements can be created and manipulated. - -3. **Accessing the Body Element**: The body of the document is obtained using - `lxb_html_document_body_element(document)`, allowing further manipulations to - be performed on this node. - -4. **Creating Elements**: A loop iterates over all tag IDs defined by the `lexbor` - library, from `LXB_TAG_A` to `LXB_TAG__LAST_ENTRY`. For each tag: - - The tag name is retrieved using `lxb_tag_name_by_id`. - - An element is created with `lxb_dom_document_create_element`. This function - constructs the DOM element based on the tag name. - - If the tag is identified as void (such as `
` or ``), it is created - without a text node. Conversely, non-void tags generate text nodes through - `lxb_dom_document_create_text_node`, allowing text content to be associated - with those elements. - -5. **Inserting Elements into the Tree**: Each created element is serialized for - output and then inserted into the body of the document using - `lxb_dom_node_insert_child`. - -6. **Final Output**: After all elements are created and appended, the updated - document tree is printed to show the result of the insertions. - -7. **Cleanup**: Finally, the allocated document is destroyed using - `lxb_html_document_destroy` to prevent memory leaks. - -## Conclusion - -This program effectively showcases the process of dynamically creating HTML -elements using the `lexbor` library. It covers the aspects of parsing, element -creation, manipulation, and serialization, providing an essential toolkit for -developers looking to work with HTML structures programmatically. The inclusion -of error handling ensures reliability, allowing developers to catch and address -potential issues during element creation. \ No newline at end of file +# HTML Element Creation and Traversal: Example + +In this example, sourced from the `lexbor/html/element_create.c` file, we will +delve into creating and manipulating HTML elements using the `lexbor` library. +This article provides a deep dive into the code, explaining how to dynamically +create every standardized HTML element, insert them into the document tree and +serialize the current structure. This example is pivotal for those seeking to +comprehend the intricacies of DOM manipulation with lexbor. + +## Key Code Sections + +### Initial Document Parsing + +First, we see the creation and initialization of an HTML document. + +```c +document = parse((const lxb_char_t *) "", 0); +body = lxb_html_document_body_element(document); +``` + +The `parse` function initializes an empty HTML document. 
The subsequent +call to `lxb_html_document_body_element` retrieves the body element of +the document. + +### Initial HTML Tree Serialization + +To observe the initial state of the HTML document, the code serializes and +prints the document. + +```c +PRINT("Inital HTML Tree:"); +serialize(lxb_dom_interface_node(document)); +printf("\n"); +``` + +Here, the `serialize` function outputs the current structure of the document +tree, which is initially empty. + +### Creating and Inserting HTML Elements + +Next, the code iterates over all known HTML tag IDs and creates corresponding +elements. + +```c +for (tag_id = LXB_TAG_A; tag_id < LXB_TAG__LAST_ENTRY; tag_id++) +{ + tag_name = lxb_tag_name_by_id(tag_id, &tag_name_len); + // Error handling omitted for brevity + + element = lxb_dom_document_create_element(&document->dom_document, + tag_name, tag_name_len, NULL); + // Error handling omitted for brevity + + if (lxb_html_tag_is_void(tag_id)) { + // Handling void elements + } + else { + text = lxb_dom_document_create_text_node(&document->dom_document, + tag_name, tag_name_len); + // Error handling omitted for brevity + + lxb_dom_node_insert_child(lxb_dom_interface_node(element), + lxb_dom_interface_node(text)); + } + serialize_node(lxb_dom_interface_node(element)); + lxb_dom_node_insert_child(lxb_dom_interface_node(body), + lxb_dom_interface_node(element)); +} +``` + +In this loop: + +1. `lxb_tag_name_by_id` retrieves the tag name associated with `tag_id`. +2. `lxb_dom_document_create_element` creates an element node for the tag. +3. If the tag is not a void element (based on the specification), a text node + with the tag name is created and appended as a child to the element. +4. `serialize_node` outputs the newly created element. +5. Finally, the element is appended to the body of the document. + +### Final HTML Tree Serialization + +After all elements are created and inserted into the document, the resulting +HTML structure is serialized and printed. 
+ +```c +PRINT("\nTree after create elements:"); +serialize(lxb_dom_interface_node(document)); +``` + +This section provides a clear view of how the document looks after all +operations. + +### Document Cleanup + +Proper resource management is crucial. The example concludes by destroying +the document to free up memory. + +```c +lxb_html_document_destroy(document); +``` + +## Notes + +- **Document Initialization**: Creating an empty document and retrieving the body + element is fundamental for subsequent operations. +- **Element Creation**: Iterating through tag IDs systematically to create all + HTML elements showcases lexbor's comprehensive coverage of HTML tags. +- **Void Elements Handling**: Differentiation between void and non-void elements + is essential to comply with HTML specifications. +- **Serialization**: The serialization function is valuable for debugging and + inspecting the document structure. + +## Summary + +This example demonstrates the power and flexibility of the `lexbor` library for +HTML document manipulation. It covers essential operations such as parsing, +element creation, and serialization, and highlights best practices like resource +management and adherence to HTML specifications. Understanding this example is +crucial for anyone looking to effectively use lexbor for DOM manipulation tasks. \ No newline at end of file diff --git a/source/examples/html/element_innerHTML.md b/source/examples/html/element_innerHTML.md index 3f3bf7f..318ffe3 100644 --- a/source/examples/html/element_innerHTML.md +++ b/source/examples/html/element_innerHTML.md @@ -1,38 +1,26 @@ -# Setting innerHTML Example +# Setting `innerHTML` Property in Lexbor: Example -This article will explain the `innerHTML` manipulation in the context of the -`lexbor` HTML parser, as illustrated in the source file -[lexbor/html/element_innerHTML.c](https://github.com/lexbor/lexbor/blob/master/examples/lexbor/html/element_innerHTML.c). 
-This example demonstrates how to parse HTML content, modify an element's inner -HTML, and serialize the result. +This example in the file `lexbor/html/element_innerHTML.c` demonstrates how to use the `lexbor` library to parse an HTML document, set the `innerHTML` of a body element, and serialize the resulting DOM tree. The intent of this code is to highlight key operations in manipulating the DOM using `lexbor`, such as document parsing, element selection, and updating the DOM tree. -## Code Overview +## Key Code Sections -The code starts with the inclusion of the necessary header file, `base.h`, which -likely contains the essential definitions and functions for the `lexbor` library. -The `main` function serves as the entry point for the execution of this program. +### Parsing HTML Document -### HTML Parsing - -The program begins by defining a simple HTML string containing a `
` with a -nested `` element. The length of this string is calculated using -`sizeof(html) - 1` to exclude the null terminator from the count. The predefined -HTML string is as follows: +First, we start by parsing the initial HTML document. The `parse` function reads the HTML string and constructs the corresponding DOM tree. ```c static const lxb_char_t html[] = "
blah-blah-blah
"; +size_t html_len = sizeof(html) - 1; + +/* Parse */ +document = parse(html, html_len); ``` -Next, the `parse` function is called with the HTML string and its length. This -function processes the HTML and generates a document object model (DOM), -representing the structure of the HTML document in memory. +Here, `html` contains our initial HTML code. `html_len` determines the length of this string (excluding the null terminator). Then, the `parse` function returns a `document` representing our HTML document. -### Printing the Parsed HTML +### Printing the Parsed Document -The program checks the output of the `parse` function and prints the original -HTML and the resulting DOM tree. This is accomplished with the `PRINT` macro, -which appears to be a utility for outputting messages. The serialized DOM is -obtained using the `serialize` function on the document's root node: +Next, the parsed HTML document is printed for verification. ```c PRINT("HTML:"); @@ -41,45 +29,64 @@ PRINT("\nTree after parse:"); serialize(lxb_dom_interface_node(document)); ``` -### Inner HTML Modification +This section outputs the original HTML string and the serialized DOM tree after parsing. The `serialize` function converts the DOM tree back to a string and prints it for inspection. + +### Obtaining the `body` Element -Subsequently, a second HTML string is defined, which will be set as the inner -HTML of the body element. This inner HTML is specified as follows: +After parsing, we obtain the `body` element from the document for further manipulation. ```c -static const lxb_char_t inner[] = "
  • 1
  • 2
  • 3
"; +/* Get BODY element */ +body = lxb_html_document_body_element(document); ``` -The program retrieves the body element of the document using -`lxb_html_document_body_element(document)`. The inner HTML of the body is then -set using the `lxb_html_element_inner_html_set` function, which takes the body -element and the inner HTML string along with its length as arguments: +This retrieves the `body` element of the parsed document, which is required to set its `innerHTML`. + +### Setting Inner HTML + +We then set the `innerHTML` of the `body` element to a new HTML string. ```c +static const lxb_char_t inner[] = "
  • 1
  • 2
  • 3
"; +size_t inner_len = sizeof(inner) - 1; + element = lxb_html_element_inner_html_set(lxb_html_interface_element(body), inner, inner_len); +if (element == NULL) { + FAILED("Failed to parse innerHTML"); +} ``` -If the `element` is `NULL`, indicating a failure in setting the inner HTML, a -failure message is printed through the `FAILED` macro. +Here, `inner` contains the new HTML to be set as the `innerHTML` of the `body` element. `inner_len` gives the length of this string. The `lxb_html_element_inner_html_set` function updates the `innerHTML` of the targeted element. An error is reported if the function fails. -### Final Output +### Printing the Updated Document -After setting the inner HTML, the program serializes the modified DOM tree and -prints the result. This demonstrates the changes made by the inner HTML -operation. Finally, the code cleans up by destroying the document to release -resources allocated for the DOM. +Finally, the modified DOM tree is serialized and printed. ```c PRINT("\nTree after innerHTML set:"); serialize(lxb_dom_interface_node(document)); +``` + +This helps verify that the new `innerHTML` has been correctly applied to the `body` element. + +### Cleaning Up + +The last step is to clean up and free the allocated memory for the document. + +```c +/* Destroy all */ lxb_html_document_destroy(document); ``` -## Conclusion +This ensures that all resources used by the document are properly released. + +## Notes + +- The `parse` function is expected to correctly handle the input HTML and generate a DOM tree. +- The function `lxb_html_element_inner_html_set` is used to set the `innerHTML` of an element and returns the modified element or `NULL` if an error occurs. +- Using `serialize` to print the DOM tree before and after modification is a good practice to verify changes made to the DOM. 
+ +## Summary -The example provided illustrates how to parse an HTML string, modify an -element's inner HTML content, and serialize the resulting DOM structure using -`lexbor`'s capabilities. This demonstrates an essential functionality often used -in web development for DOM manipulation, showcasing the ease of use of the -`lexbor` library for such tasks. \ No newline at end of file +This example demonstrates the essential steps for manipulating an HTML document using the `lexbor` library: parsing the document, selecting elements, updating the `innerHTML`, and serializing the DOM tree. By following this process, developers can effectively manage the DOM structure of HTML documents using `lexbor`. \ No newline at end of file diff --git a/source/examples/html/elements_by_attr.md b/source/examples/html/elements_by_attr.md index 4171242..5f2d032 100644 --- a/source/examples/html/elements_by_attr.md +++ b/source/examples/html/elements_by_attr.md @@ -1,148 +1,113 @@ -# Retrieving Elements by Attribute Example +# Extracting Elements by Attribute: Example -This article will explain the functionality and implementation of the code found -in **lexbor/html/elements_by_attr.c**, which demonstrates how to retrieve DOM -elements based on specific attributes using the `lexbor` library. +The file `lexbor/html/elements_by_attr.c` demonstrates how to use the `lexbor` library to extract and manipulate HTML elements based on their attributes. This example illustrates a range of selection techniques, including full match, starts with, ends with, and contains. Here, we will provide an in-depth explanation of the key sections within this code to better understand its functionality. -## Overview +## Key Code Sections -The provided code showcases how to extract elements from an HTML document based -on their attributes. 
It specifically focuses on obtaining elements by 'class' -and 'href' attributes, employing methods that match, search from the beginning, -and search from the end of the attribute values. +### Initialization and Parsing -## Code Breakdown - -### Including Necessary Headers - -The code starts with including essential headers: +The example starts by initializing required variables and parsing the HTML document. ```c -#include "base.h" -#include +lxb_html_document_t *document; +const lxb_char_t html[] = "
" + "
" + "
" + "ref" + "
"; +size_t html_size = sizeof(html) - 1; + +document = parse(html, html_size); +body = lxb_dom_interface_element(document->body); ``` -The `base.h` header seems to contain definitions and functions crucial for this -example, while `lexbor/dom/dom.h` provides the necessary DOM manipulations for -lexbor. +The `parse` function converts the raw HTML string into a structured `document` that `lexbor` can process. The `lxb_dom_interface_element` call retrieves the body element from the document for further manipulation. -### Print Collection Function +### Creating the Collection -The function `print_collection_elements` is defined to handle the output of the -retrieved elements: +Next, the code creates a collection object to hold the selected elements. ```c -static void print_collection_elements(lxb_dom_collection_t *collection) +collection = lxb_dom_collection_make(&document->dom_document, 128); +if (collection == NULL) { + FAILED("Failed to create Collection object"); +} ``` -This function loops through the elements within the provided collection using -its length and utilizes the `serialize_node` function to print each element. -After processing, it ensures to clean up the collection to prevent memory leaks. +By calling `lxb_dom_collection_make`, a new collection is created with an initial capacity of 128 elements. This collection will be reused for different attribute selection methods. -### Main Function Execution +### Full Match Selection -The `main` function is where the key processes occur: +This section demonstrates how to select elements by an exact attribute match. 
```c -int main(int argc, const char *argv[]) +status = lxb_dom_elements_by_attr(body, collection, + (const lxb_char_t *) "class", 5, + (const lxb_char_t *) "red c++ best", 12, + true); +if (status != LXB_STATUS_OK) { + FAILED("Failed to get elements by name"); +} +PRINT("\nFull match by 'red c++ best':"); +print_collection_elements(collection); ``` -#### Parsing HTML +The `lxb_dom_elements_by_attr` function is used here to find elements with the `class` attribute exactly matching "red c++ best." The result is stored in the `collection`. -The HTML content is defined statically: +### Begin-Match Selection ```c -const lxb_char_t html[] = "
" -"
" -"
" -"ref" -"
"; +status = lxb_dom_elements_by_attr_begin(body, collection, + (const lxb_char_t *) "href", 4, + (const lxb_char_t *) "http", 4, + true); +if (status != LXB_STATUS_OK) { + FAILED("Failed to get elements by name"); +} +PRINT("\nFrom begin by 'http':"); +print_collection_elements(collection); ``` -This string contains several `
` and `` tags with diverse class -attributes and an `href`. The length of this HTML string is then calculated. - -#### Creating Document and Collection +In this snippet, `lxb_dom_elements_by_attr_begin` selects elements where the `href` attribute starts with "http". This demonstrates the flexibility of attribute-based selection. -Following that, the HTML is parsed, creating a document object: +### End-Match Selection ```c -document = parse(html, html_szie); +status = lxb_dom_elements_by_attr_end(body, collection, + (const lxb_char_t *) "class", 5, + (const lxb_char_t *) "grep", 4, + true); +if (status != LXB_STATUS_OK) { + FAILED("Failed to get elements by name"); +} +PRINT("\nFrom end by 'grep':"); +print_collection_elements(collection); ``` -Next, a collection object is created that will hold the elements found based on -the attribute queries: +The `lxb_dom_elements_by_attr_end` function selects elements where the `class` attribute ends with "grep." + +### Contain-Match Selection ```c -collection = lxb_dom_collection_make(&document->dom_document, 128); +status = lxb_dom_elements_by_attr_contain(body, collection, + (const lxb_char_t *) "class", 5, + (const lxb_char_t *) "c++ b", 5, + true); +if (status != LXB_STATUS_OK) { + FAILED("Failed to get elements by name"); +} +PRINT("\nContain by 'c++ b':"); +print_collection_elements(collection); ``` -A check is performed to ensure that the collection was created successfully. - -#### Searching Elements by Attributes - -The program performs several searches: - -1. **Full Match:** Using `lxb_dom_elements_by_attr`, it searches for elements - with the exact class `red c++ best`: - - ```c - status = lxb_dom_elements_by_attr(body, collection, - (const lxb_char_t *) "class", 5, - (const lxb_char_t *) "red c++ best", 12, - true); - ``` +Lastly, `lxb_dom_elements_by_attr_contain` is used to find elements with `class` attributes containing "c++ b." - If the search is successful, the found elements are printed. - -2. 
**From Beginning:** The code retrieves elements with an `href` that starts - with `http`: - - ```c - status = lxb_dom_elements_by_attr_begin(body, collection, - (const lxb_char_t *) "href", 4, - (const lxb_char_t *) "http", 4, - true); - ``` - -3. **From End:** This search targets elements with classes ending in `grep`: - - ```c - status = lxb_dom_elements_by_attr_end(body, collection, - (const lxb_char_t *) "class", 5, - (const lxb_char_t *) "grep", 4, - true); - ``` - -4. **Contain:** Finally, it looks for elements where the class contains the - substring `c++ b`: - - ```c - status = lxb_dom_elements_by_attr_contain(body, collection, - (const lxb_char_t *) "class", 5, - (const lxb_char_t *) "c++ b", 5, - true); - ``` - -Each of these searches utilizes the collection to retrieve relevant elements, -printing them as they are found. - -#### Cleanup - -After the searches, cleanup processes are executed to free the allocated -resources: - -```c -lxb_dom_collection_destroy(collection, true); -lxb_html_document_destroy(document); -``` +## Notes -This is critical for maintaining memory hygiene in C programs. +- The `print_collection_elements` function efficiently serializes and prints the details of the selected elements. +- The collection is cleaned after each selection to prepare it for the next usage. +- Error handling ensures that failures in creating the collection or selecting elements are reported. -## Conclusion +## Summary -This code snippet demonstrates how to efficiently query and manipulate DOM -elements in an HTML document using the `lexbor` library. By utilizing various -search strategies based on attributes, developers can effectively streamline -their DOM interactions, showcasing the flexibility and power of the lexbor -library for handling HTML content. \ No newline at end of file +This example showcases various techniques to select HTML elements by attributes using the `lexbor` library. 
By understanding how to utilize functions like `lxb_dom_elements_by_attr`, `lxb_dom_elements_by_attr_begin`, `lxb_dom_elements_by_attr_end`, and `lxb_dom_elements_by_attr_contain`, developers can effectively manipulate and query HTML documents based on specific attribute criteria. This is essential for tasks involving web scraping, data extraction, and document manipulation. \ No newline at end of file diff --git a/source/examples/html/elements_by_class_name.md b/source/examples/html/elements_by_class_name.md index ff3b01f..7d8e904 100644 --- a/source/examples/html/elements_by_class_name.md +++ b/source/examples/html/elements_by_class_name.md @@ -1,19 +1,19 @@ -# Getting Elements by Class Name Example +# Querying Elements by Class Name: Example -In this article, we will explore the implementation details and functionality of -the `elements_by_class_name` example, found in the -[lexbor/html/elements_by_class_name.c](https://github.com/lexbor/lexbor/blob/master/examples/lexbor/html/elements_by_class_name.c) -source file. The code demonstrates how to parse an HTML string and retrieve -elements with a specific class name using the `lexbor` library. This example is -essential for developers seeking to manipulate and query DOM elements in a -structured manner. +File: `lexbor/html/elements_by_class_name.c` -## Overview +This example demonstrates how to use the `lexbor` library to parse an HTML +document and retrieve all elements with a specific class name. The example +focuses on finding elements with the class name `"best"` from a given HTML +string and serializing them for output. -The `main` function begins by initializing variables, including `status`, -`element`, `document`, and `collection`. It then assigns an HTML string to the -`html` variable, which contains multiple `
` elements with various class -names. The length of the HTML string is calculated and stored in `html_size`. +## Key Code Sections + +### Parsing the HTML Document + +The first step involves parsing a hard-coded HTML string into a +`lxb_html_document_t` object that can be manipulated through the `lexbor` +library. ```c const lxb_char_t html[] = "
" @@ -21,25 +21,19 @@ const lxb_char_t html[] = "
" "
" "
"; -size_t html_size = sizeof(html) - 1; -``` - -## Parsing the HTML Document +size_t html_szie = sizeof(html) - 1; -Next, the code invokes the `parse` function to parse the HTML string and create -a DOM document. This document serves as the basis for subsequent operations on -the DOM elements contained within the HTML. - -```c -document = parse(html, html_size); +document = parse(html, html_szie); ``` -## Creating a Collection for DOM Elements +Here, the HTML string contains multiple `<div>` elements with different class +names. The `parse` function is used to convert this HTML string into a +`document` object, which can then be queried. + +### Creating a Collection -Once the document is obtained, the next step is to create a collection to hold -the elements retrieved by class name. The `lxb_dom_collection_make` function is -called with the document's DOM and an initial capacity of 128. If the collection -cannot be created, an error message is triggered. +To store the elements that match a specific query, a collection object is +created using `lxb_dom_collection_make`. ```c collection = lxb_dom_collection_make(&document->dom_document, 128); @@ -48,12 +42,13 @@ if (collection == NULL) { } ``` -## Retrieving Elements by Class Name +The collection is initialized with a capacity of 128 elements, a reasonable +default size for various use cases. -The `lxb_dom_elements_by_class_name` function enables the search for elements -with a specified class name. In this instance, it looks for elements with the -class name "best". The function leverages the interface of the document's body -to initiate the retrieval process and populate the `collection`. +### Querying by Class Name + +The core functionality of this example is querying the parsed document by a +specific class name using `lxb_dom_elements_by_class_name`. ```c status = lxb_dom_elements_by_class_name(lxb_dom_interface_element(document->body), @@ -63,21 +58,15 @@ if (status != LXB_STATUS_OK) { } ``` -After ensuring the retrieval is successful, the code proceeds to print the -original HTML and details about the found elements.
- -```c -PRINT("HTML:"); -PRINT("%s", (const char *) html); -PRINT("\nFind all 'div' elements by class name 'best'."); -PRINT("Elements found:"); -``` +Here, the function `lxb_dom_elements_by_class_name` is called with the root +element of the document's body, the collection to store results, the class name +`"best"` (as a `const lxb_char_t *`), and the length of the class name (which is +`4`). This function searches for all elements with the class name `"best"` and +stores them in the collection. -## Serializing and Printing Found Elements +### Serializing and Printing the Results -A loop iterates through the collection of found elements, invoking the -`serialize_node` function to output each element's details. This demonstrates -how easy it is to interact with the elements returned by the class name query. +Once the elements are found, they are iterated over and serialized for output. ```c for (size_t i = 0; i < lxb_dom_collection_length(collection); i++) { @@ -86,21 +75,34 @@ for (size_t i = 0; i < lxb_dom_collection_length(collection); i++) { } ``` -## Cleanup +Each element in the collection is retrieved using +`lxb_dom_collection_element` and passed to the `serialize_node` function, which +handles the process of serialization into a string format for printing. -Finally, the `collection` and `document` are cleaned up to free allocated -resources. This step is crucial for managing memory within the application, -especially when dealing with large or complex documents. +### Cleaning Up + +Finally, the collection and document are properly destroyed to free up memory. ```c lxb_dom_collection_destroy(collection, true); lxb_html_document_destroy(document); ``` -## Conclusion +## Notes + +- **Memory Management**: Proper memory management is crucial. Ensure that all + created objects are destroyed to prevent memory leaks. 
+- **Error Handling**: Always check the return status of functions, especially + those that create objects or perform searches, to handle errors gracefully. +- **Collection Size**: The initial size of the collection can be adjusted based + on the expected number of elements to optimize performance. + +## Summary -The `elements_by_class_name` example illustrates how to use the `lexbor` library -to parse HTML content, search for elements by class name, and efficiently manage -those elements. The critical sections of the code demonstrate proper document -handling, error management, and systematic cleanup, providing a solid foundation -for developers exploring DOM manipulation within C. \ No newline at end of file +This example illustrates how to effectively use the `lexbor` library for +searching and manipulating elements in an HTML document. By understanding how to +parse the document, query elements by class name, and handle them appropriately, +you can leverage `lexbor` for various web scraping or HTML manipulation tasks. +The key takeaway is the efficient and accurate way `lexbor` allows querying and +handling elements based on class names, showcasing its robust capabilities for +document object model manipulation. \ No newline at end of file diff --git a/source/examples/html/elements_by_tag_name.md b/source/examples/html/elements_by_tag_name.md index 259b679..3be41dc 100644 --- a/source/examples/html/elements_by_tag_name.md +++ b/source/examples/html/elements_by_tag_name.md @@ -1,138 +1,108 @@ -# HTML Elements by Tag Name Example +# Extracting Elements by Tag Name: Example -This article will explain the code found in the source file -[lexbor/html/elements_by_tag_name.c](https://github.com/lexbor/lexbor/blob/master/examples/lexbor/html/elements_by_tag_name.c), -which demonstrates how to find and print HTML elements by their tag names using -the `lexbor` DOM library. 
+In this article, we will delve into the `lexbor/html/elements_by_tag_name.c` example, +which demonstrates how to extract HTML elements by their tag name using the `lexbor` +library. This specific example focuses on parsing an HTML snippet and then retrieving +all `<div>` elements from it. We will analyze the different sections of the code to +understand how `lexbor` functions and data types facilitate these operations. -## Code Overview +## Key Code Sections -The purpose of this example is to parse a simple HTML string and retrieve all -`
` elements from the parsed document. It achieves this by leveraging the -`lexbor` library's DOM capabilities to manage and manipulate the HTML document -structure. +### Parsing the HTML Document -## Main Function - -The entry point of the program is the `main` function, which begins by declaring -several variables essential for the parsing process: - -- `status` stores the success or failure status of various operations. -- `element` will point to the current HTML element being processed. -- `document` links to the HTML document that will be created from the parsed - input. -- `collection` is intended to hold the collection of elements found in the - document. - -### Parsing HTML - -The HTML string defined as: +The first significant step in the code is parsing an HTML document using the given +HTML content. ```c const lxb_char_t html[] = "
"; -``` - -represents a simple HTML fragment which contains two `
` elements and a -`` element. The size of the HTML string is determined next: - -```c -size_t html_size = sizeof(html) - 1; -``` - -This allows the program to recognize the length of the string without including -the null terminator. - -The `parse` function is then called to create a `document` from the HTML string: +size_t html_szie = sizeof(html) - 1; -```c -document = parse(html, html_size); +document = parse(html, html_szie); ``` -This function interprets the HTML and constructs a corresponding DOM structure. -The parsing outcome is crucial; it will dictate the next steps in the program. +The `parse` function takes the HTML content and its size to convert the string into +an `lxb_html_document_t` structure. This document represents the parsed HTML in +memory, allowing further manipulations. -### Creating a DOM Collection +### Creating and Initializing the Collection -A collection is created to hold the resulting nodes: +Next, we need a collection to store the elements we find. `lexbor` provides +mechanisms for creating and managing such collections efficiently. ```c collection = lxb_dom_collection_make(&document->dom_document, 128); -``` - -This function attempts to allocate memory for a collection that can store up to -128 DOM elements. If memory allocation fails, the program exits with an error -message: - -```c if (collection == NULL) { FAILED("Failed to create Collection object"); } ``` -### Retrieving Elements by Tag Name +Here, `lxb_dom_collection_make` initializes a collection with a preallocated size of +128 elements. If the creation fails, it returns `NULL`, prompting the program to +exit with an error message. -The critical operation of this example is retrieving `
` elements from the -document: +### Finding Elements by Tag Name + +The critical function `lxb_dom_elements_by_tag_name` performs the task of finding +all elements with a specific tag name. ```c status = lxb_dom_elements_by_tag_name(lxb_dom_interface_element(document->body), collection, (const lxb_char_t *) "div", 3); -``` - -Here, `lxb_dom_elements_by_tag_name` takes three parameters: -1. The reference to the body of the document. -2. The collection object to store the found elements. -3. The string `"div"` along with its length, specifying which tags to search - for. - -If the call is unsuccessful, it again exits with an error message: - -```c if (status != LXB_STATUS_OK) { FAILED("Failed to get elements by name"); } ``` -### Output the Found Elements +In this code snippet: +- `lxb_dom_interface_element(document->body)` converts the body of the document + into a generic element interface. +- `collection` is passed to store the found elements. +- The tag name `"div"` is specified along with its length, `3`. -The program then prints the initial HTML string and displays a message -indicating that it is about to list the found `
` elements: +If the call fails, it returns a status other than `LXB_STATUS_OK`, and the +program exits with an error message. -```c -PRINT("HTML:"); -PRINT("%s", (const char *) html); -PRINT("\nFind all 'div' elements by tag name 'div'."); -PRINT("Elements found:"); -``` +### Iterating Over and Serializing Found Elements -The elements collected are iterated over and serialized for display: +Once the elements are found, we iterate over the collection and serialize each node +for display. ```c for (size_t i = 0; i < lxb_dom_collection_length(collection); i++) { element = lxb_dom_collection_element(collection, i); + serialize_node(lxb_dom_interface_node(element)); } ``` -This loop retrieves each element from the collection by index and uses the -`serialize_node` function to output its representation. +We loop through each element in the collection, retrieve it using +`lxb_dom_collection_element`, and then serialize it for output using the +`serialize_node` function. ### Cleanup -Finally, memory allocated for the collection and the document is released: +Proper cleanup of allocated resources is crucial to avoid memory leaks. ```c lxb_dom_collection_destroy(collection, true); lxb_html_document_destroy(document); ``` -This ensures that there are no memory leaks after the program's execution is -complete. +Here, `lxb_dom_collection_destroy` releases the memory for the collection, and +`lxb_html_document_destroy` does the same for the document. + +## Notes + +- The example underscores the importance of checking return values for error + handling. +- It showcases the use of `lxb_dom_elements_by_tag_name` to query elements + efficiently. -## Conclusion +## Summary -This example serves as a practical demonstration of how to use the `lexbor` -library to parse HTML and find elements by tag name. By using functions from the -library's API, the code effectively processes a document and manages collections -of elements, showcasing the utility of the `lexbor` framework in web development -tasks.
\ No newline at end of file +The `lexbor/html/elements_by_tag_name.c` example effectively demonstrates how to +parse an HTML document and extract elements by their tag name. Key takeaways include +the importance of proper initialization and error handling, as well as the +simplicity and power of the `lexbor` API for DOM manipulation tasks. This example is +an excellent starting point for developers looking to utilize the `lexbor` library +for web scraping or HTML processing tasks. \ No newline at end of file diff --git a/source/examples/html/encoding.md b/source/examples/html/encoding.md index 645619f..136e255 100644 --- a/source/examples/html/encoding.md +++ b/source/examples/html/encoding.md @@ -1,84 +1,100 @@ -# HTML Encoding Example +# Determining HTML Encoding: Example -This article provides an explanation for the HTML Encoding example found in the -file -[lexbor/html/encoding.c](https://github.com/lexbor/lexbor/blob/master/examples/lexbor/html/encoding.c). -This program is designed to read an HTML file, determine its character encoding, -and print it out. The implementation utilizes the `lexbor` library, which offers -various functions to handle encoding. +The example code in `lexbor/html/encoding.c` demonstrates how to determine the encoding of an HTML file using the `lexbor` library. This example is particularly useful for understanding how to initialize the encoding mechanism and extract the encoding information from the HTML content. -## Overview - -The main function of the example handles command-line input, reads an HTML file, -and determines its encoding using the `lexbor` library. The code includes a -failure handling mechanism and a usage function to guide users on how to execute -the program properly. +In this example, the code performs several tasks to determine the HTML encoding. It initializes the HTML encoding detection system, reads the HTML file, and then identifies the encoding used in that file. 
This process is useful for web scraping, data extraction, and ensuring proper text rendering. The file in question is `lexbor/html/encoding.c`. ## Key Code Sections -### Error Handling Macro +### Main Function and Input Handling + +The program starts with the `main` function, which handles user input and delegates file reading and encoding detection. + +```c +int +main(int argc, const char *argv[]) +{ + size_t len; + lxb_char_t *html; + lxb_status_t status; + lxb_html_encoding_t em; + lxb_html_encoding_entry_t *entry; + + if (argc != 2) { + usage(); + exit(EXIT_SUCCESS); + } + + html = lexbor_fs_file_easy_read((lxb_char_t *) argv[1], &len); + if (html == NULL) { + FAILED(true, "Failed to read file: %s", argv[1]); + } + // ... rest of code ... +} +``` + +Here, the program expects a single argument: the path to the HTML file. It reads the file content using `lexbor_fs_file_easy_read`, which returns the file's content and length. + +### Encoding Initialization + +Next, the program initializes the encoding detection mechanism. + +```c +status = lxb_html_encoding_init(&em); +if (status != LXB_STATUS_OK) { + FAILED(false, "Failed to init html encoding"); +} +``` + +This part initializes the `lxb_html_encoding_t` structure. If initialization fails, the program exits with an error message. + +### Encoding Determination + +The core logic for determining the encoding follows. -The `FAILED` macro is a pivotal part of this code, providing a consistent way to -handle errors throughout the program. It takes two parameters: a boolean flag -`with_usage` and a variable number of arguments. If an error occurs, it prints -the provided error message to the standard error stream and, if requested, -displays the usage information before quitting the program. This helps keep the -code clean while managing multiple error points effectively. 
+```c +status = lxb_html_encoding_determine(&em, html, (html + len)); +if (status != LXB_STATUS_OK) { + goto failed; +} -### Command-Line Arguments +entry = lxb_html_encoding_meta_entry(&em, 0); +if (entry != NULL) { + printf("%.*s\n", (int) (entry->end - entry->name), entry->name); +} +else { + printf("Encoding not found\n"); +} +``` -In the `main` function, the program checks the number of command-line arguments -passed to it. If the argument count does not equal 2, the program calls the -`usage` function to provide instructions on how to execute the program correctly -and then exits. This ensures that users understand how to use the program before -any further processing occurs. +The function `lxb_html_encoding_determine` scans the HTML content to find any encoding declarations. If an encoding is found, it retrieves the encoding entry using `lxb_html_encoding_meta_entry` and prints the encoding name. -### Reading the HTML File +### Error Handling and Cleanup -The program reads the HTML file specified in the command-line argument using the -`lexbor_fs_file_easy_read` function. It stores the content in a dynamic array -and checks for successful reading. If the file cannot be read, it invokes the -`FAILED` macro with an appropriate error message, ensuring that the program does -not proceed with `NULL` data. +In case of errors, the program provides error messages and performs necessary cleanups. -### Initializing HTML Encoding +```c +lexbor_free(html); +lxb_html_encoding_destroy(&em, false); -The core logic for handling character encoding begins with the initialization of -the `lxb_html_encoding_t` struct via the `lxb_html_encoding_init` function. This -struct is essential for managing encoding data throughout the program. If -initialization fails, the program handles the error gracefully using the -`FAILED` macro again. 
+return 0; -### Determining Encoding +failed: -The most crucial part of the program is determining the HTML encoding with the -`lxb_html_encoding_determine` function. This function analyzes the passed HTML -data to determine its encoding. In the previous comment section, there is a -mention of a 1024-byte limit, which reflects a common optimization practice -where a program doesn't need to read the entire file if a meta encoding tag is -typically found within the first 1024 bytes. However, this section is commented -out, meaning the program currently reads the complete content. +lexbor_free(html); +lxb_html_encoding_destroy(&em, false); -### Printing the Encoding +FAILED(false, "Failed to determine encoding"); +``` -Once the encoding is determined, the program retrieves the encoding entry using -`lxb_html_encoding_meta_entry`. If a valid entry is found, it prints the -encoding name. If no encoding is determined, it simply outputs that the encoding -was not found. This provides the user with understandable feedback regarding the -HTML file's character encoding. +Here, `lexbor_free` releases the allocated memory for the HTML content, and `lxb_html_encoding_destroy` cleans up the encoding structure. -### Cleanup +## Notes -At the end of the program, whether successful or in the case of an error, memory -cleanup is performed. The `lexbor_free` function is called to release the -allocated memory for the HTML content, and `lxb_html_encoding_destroy` cleans up -the encoding struct. This is an important step to prevent memory leaks and -ensure proper resource management. +- The example passes the entire file content to `lxb_html_encoding_determine`; since encoding declarations are typically found early in the HTML, limiting the scan to the first 1024 bytes is a possible optimization. +- It uses `lexbor_fs_file_easy_read` for easy file reading, which abstracts away low-level file operations. +- Proper initialization and cleanup are crucial to avoid memory leaks.
-## Conclusion +## Summary -The HTML Encoding example demonstrates essential practices such as error -handling, memory management, and the use of a library to enhance functionality. -By following this example, developers can understand how to utilize the `lexbor` -library for encoding detection in HTML documents, while also adhering to proper -coding standards for readability and maintainability. \ No newline at end of file +This example provides a clear, practical demonstration of how to determine the encoding of an HTML file using the `lexbor` library. It covers essential tasks such as initialization, reading file content, detecting encoding, and handling errors. Understanding this example is invaluable for developers needing to ensure correct text processing and rendering in various web-related applications. \ No newline at end of file diff --git a/source/examples/html/html2sexpr.md b/source/examples/html/html2sexpr.md index c2cbba6..21a61bb 100644 --- a/source/examples/html/html2sexpr.md +++ b/source/examples/html/html2sexpr.md @@ -1,121 +1,243 @@ -# HTML to S-Expression Converter Example +# Converting HTML Tag Tree to S-Expressions: Example -This article provides an overview of a code example found in the file -[lexbor/html/html2sexpr.c](https://github.com/lexbor/lexbor/blob/master/examples/lexbor/html/html2sexpr.c). -The program is designed to convert an HTML tag tree into an S-expression string -and output it to standard output. The program utilizes the `lexbor` library to -handle parsing and manipulating HTML documents. +This article provides an in-depth explanation of the code from the file `lexbor/html/html2sexpr.c`. The example demonstrates how to use the `lexbor` library to convert an HTML tag tree into an s-expression string, which is output to `stdout`. It covers the process of reading an HTML file, parsing it into a DOM tree, traversing the tree, and then serializing it into s-expressions. 
-## Overview +## Key Code Sections -The program first checks if the correct number of command-line arguments is -provided. It expects one argument: the path to an HTML file. It reads the -contents of this file and initializes an HTML document object using `lexbor`'s -API. After parsing the HTML, the program invokes a tree-walking function to -serialize the HTML structure into an S-expression format. The serialized output -is then printed to the console. +### Main Function Logic -## Major Code Sections +The `main` function initializes the HTML document, parses the input file, and invokes the traversal and serialization process. The core of the main function is structured as follows: -### Argument Handling and File Reading +```c +int +main(int argc, const char *argv[]) +{ + if (argc != 2) { + usage(); + FAILED("Invalid number of arguments"); + } -The `main` function begins with argument validation. It ensures that exactly one -argument is received; otherwise, it calls the `usage` function, which prints the -program's usage instructions to standard error. + lxb_status_t status; + lxb_html_document_t *document; + lxb_char_t *html; + size_t html_len; -```c -if (argc != 2) { - usage(); - FAILED("Invalid number of arguments"); -} -``` + html = lexbor_fs_file_easy_read((const lxb_char_t *) argv[1], &html_len); + if (html == NULL) { + FAILED("Failed to read HTML file"); + } -Upon validation, the program proceeds to read the HTML file using the -`lexbor_fs_file_easy_read` function, which simplifies file reading: + document = lxb_html_document_create(); + if (document == NULL) { + PRINT("Failed to create HTML Document"); + goto failed; + } -```c -html = lexbor_fs_file_easy_read((const lxb_char_t *) argv[1], &html_len); -``` + status = lxb_html_document_parse(document, html, html_len); + if (status != LXB_STATUS_OK) { + PRINT("Failed to parse HTML"); + goto failed; + } -If file reading fails, it reports an error and resizes relevant resources. 
+ status = tree_walker(lxb_dom_interface_node(document)->first_child, + serialize_cb, NULL); + if (status != LXB_STATUS_OK) { + PRINT("Failed to convert HTML to S-Expression"); + goto failed; + } -### HTML Document Initialization and Parsing + lxb_html_document_destroy(document); + lexbor_free(html); -Next, the code creates an HTML document object with `lxb_html_document_create`. -If this allocation fails, it destroys any previously allocated document and -frees the memory associated with the HTML content: + return EXIT_SUCCESS; -```c -document = lxb_html_document_create(); -``` +failed: -After successfully creating the document, the program parses the HTML content: + lxb_html_document_destroy(document); + lexbor_free(html); -```c -status = lxb_html_document_parse(document, html, html_len); + return EXIT_FAILURE; +} ``` -This step processes the HTML string and builds a structured representation of -the document. - -### Traversing the DOM and Serializing to S-Expression +In this sequence, the key steps are: +1. **Reading the HTML File**: The `lexbor_fs_file_easy_read` function reads the HTML file and stores its content in `html`. +2. **Document Creation and Parsing**: The document is created using `lxb_html_document_create` and parsed with `lxb_html_document_parse`. +3. **Tree Traversal**: The `tree_walker` is called to traverse the HTML tree and serialize it. -The `tree_walker` function is the core of the serialization process. It -traverses the DOM tree recursively, converting each node into an S-expression -format. +### Tree Walking and Serialization -It begins by checking the type of each node. For elements, it calls the -serialization callback `cb` to append the opening parenthesis, the node's name, -and any attributes: +The `tree_walker` function recursively traverses the HTML DOM tree and calls the provided callback to serialize each node and its attributes into s-expressions. 
```c -if (node->type == LXB_DOM_NODE_TYPE_ELEMENT) { - status = cb((const lxb_char_t *) "(", 1, ctx); - ... - // Invokes the attributes function - status = attributes(node, cb, ctx); +static lxb_status_t +tree_walker(lxb_dom_node_t *node, lxb_html_serialize_cb_f cb, void *ctx) +{ + lxb_status_t status; + lxb_dom_node_t *root = node->parent; + + const lxb_char_t *name; + size_t name_len = 0; + + while (node != NULL) { + if (node->type == LXB_DOM_NODE_TYPE_ELEMENT) { + status = cb((const lxb_char_t *) "(", 1, ctx); + if (status != LXB_STATUS_OK) { + return status; + } + + name = lxb_dom_element_qualified_name(lxb_dom_interface_element(node), + &name_len); + + status = cb(name, name_len, ctx); + if (status != LXB_STATUS_OK) { + return status; + } + + status = attributes(node, cb, ctx); + if (status != LXB_STATUS_OK) { + return status; + } + + if (node->local_name == LXB_TAG_TEMPLATE) { + lxb_html_template_element_t *temp = lxb_html_interface_template(node); + if (temp->content != NULL && temp->content->node.first_child != NULL) { + status = tree_walker(&temp->content->node, cb, ctx); + if (status != LXB_STATUS_OK) { + return status; + } + } + } + } + + if (node->first_child != NULL) { + node = node->first_child; + } + else { + // Closing tag + while (node != root && node->next == NULL) { + if (node->type == LXB_DOM_NODE_TYPE_ELEMENT) { + status = cb((const lxb_char_t *) ")", 1, ctx); + if (status != LXB_STATUS_OK) { + return status; + } + } + + node = node->parent; + } + + if (node->type == LXB_DOM_NODE_TYPE_ELEMENT) { + status = cb((const lxb_char_t *) ")", 1, ctx); + if (status != LXB_STATUS_OK) { + return status; + } + } + + if (node == root) { + break; + } + + node = node->next; + } + } + + return LXB_STATUS_OK; +} ``` -The `attributes` function iterates through each node's attributes and formats -them as `(attribute_name 'attribute_value)` pairs, again using the callback to -transmit this information. +This function: +1. 
**Starts the S-Expression Serialization**: Outputs a `(` followed by the element's name. +2. **Calls `attributes` Function**: Serializes each attribute into s-expressions. +3. **Recursively Processes Template Content**: Handles the special case of `