diff --git a/.rspec b/.rspec new file mode 100644 index 0000000..7e3eb5c --- /dev/null +++ b/.rspec @@ -0,0 +1,3 @@ +--color +--require spec_helper +--format doc diff --git a/Gemfile b/Gemfile new file mode 100644 index 0000000..ce3fe7f --- /dev/null +++ b/Gemfile @@ -0,0 +1,6 @@ +# Gemfile + +source 'https://rubygems.org' + +gem 'rspec', '~> 3.5.0' +gem 'guard-rspec', '~> 4.7.3', require: false \ No newline at end of file diff --git a/Gemfile.lock b/Gemfile.lock new file mode 100644 index 0000000..3874780 --- /dev/null +++ b/Gemfile.lock @@ -0,0 +1,65 @@ +GEM + remote: https://rubygems.org/ + specs: + coderay (1.1.1) + diff-lcs (1.3) + ffi (1.9.18) + formatador (0.2.5) + guard (2.14.1) + formatador (>= 0.2.4) + listen (>= 2.7, < 4.0) + lumberjack (~> 1.0) + nenv (~> 0.1) + notiffany (~> 0.0) + pry (>= 0.9.12) + shellany (~> 0.0) + thor (>= 0.18.1) + guard-compat (1.2.1) + guard-rspec (4.7.3) + guard (~> 2.1) + guard-compat (~> 1.1) + rspec (>= 2.99.0, < 4.0) + listen (3.1.5) + rb-fsevent (~> 0.9, >= 0.9.4) + rb-inotify (~> 0.9, >= 0.9.7) + ruby_dep (~> 1.2) + lumberjack (1.0.11) + method_source (0.8.2) + nenv (0.3.0) + notiffany (0.1.1) + nenv (~> 0.1) + shellany (~> 0.0) + pry (0.10.4) + coderay (~> 1.1.0) + method_source (~> 0.8.1) + slop (~> 3.4) + rb-fsevent (0.9.8) + rb-inotify (0.9.8) + ffi (>= 0.5.0) + rspec (3.5.0) + rspec-core (~> 3.5.0) + rspec-expectations (~> 3.5.0) + rspec-mocks (~> 3.5.0) + rspec-core (3.5.4) + rspec-support (~> 3.5.0) + rspec-expectations (3.5.0) + diff-lcs (>= 1.2.0, < 2.0) + rspec-support (~> 3.5.0) + rspec-mocks (3.5.0) + diff-lcs (>= 1.2.0, < 2.0) + rspec-support (~> 3.5.0) + rspec-support (3.5.0) + ruby_dep (1.5.0) + shellany (0.0.1) + slop (3.6.0) + thor (0.19.4) + +PLATFORMS + ruby + +DEPENDENCIES + guard-rspec (~> 4.7.3) + rspec (~> 3.5.0) + +BUNDLED WITH + 1.14.4 diff --git a/Guardfile b/Guardfile new file mode 100644 index 0000000..3215f01 --- /dev/null +++ b/Guardfile @@ -0,0 +1,70 @@ +# A sample Guardfile +# More info at 
https://github.com/guard/guard#readme + +## Uncomment and set this to only include directories you want to watch +# directories %w(app lib config test spec features) \ +# .select{|d| Dir.exists?(d) ? d : UI.warning("Directory #{d} does not exist")} + +## Note: if you are using the `directories` clause above and you are not +## watching the project directory ('.'), then you will want to move +## the Guardfile to a watched dir and symlink it back, e.g. +# +# $ mkdir config +# $ mv Guardfile config/ +# $ ln -s config/Guardfile . +# +# and, you'll have to watch "config/Guardfile" instead of "Guardfile" + +# Note: The cmd option is now required due to the increasing number of ways +# rspec may be run, below are examples of the most common uses. +# * bundler: 'bundle exec rspec' +# * bundler binstubs: 'bin/rspec' +# * spring: 'bin/rspec' (This will use spring if running and you have +# installed the spring binstubs per the docs) +# * zeus: 'zeus rspec' (requires the server to be started separately) +# * 'just' rspec: 'rspec' + +guard :rspec, cmd: "bundle exec rspec" do + require "guard/rspec/dsl" + dsl = Guard::RSpec::Dsl.new(self) + + # Feel free to open issues for suggestions and improvements + + # RSpec files + rspec = dsl.rspec + watch(rspec.spec_helper) { rspec.spec_dir } + watch(rspec.spec_support) { rspec.spec_dir } + watch(rspec.spec_files) + + # Ruby files + ruby = dsl.ruby + dsl.watch_spec_files_for(ruby.lib_files) + + # Rails files + rails = dsl.rails(view_extensions: %w(erb haml slim)) + dsl.watch_spec_files_for(rails.app_files) + dsl.watch_spec_files_for(rails.views) + + watch(rails.controllers) do |m| + [ + rspec.spec.call("routing/#{m[1]}_routing"), + rspec.spec.call("controllers/#{m[1]}_controller"), + rspec.spec.call("acceptance/#{m[1]}") + ] + end + + # Rails config changes + watch(rails.spec_helper) { rspec.spec_dir } + watch(rails.routes) { "#{rspec.spec_dir}/routing" } + watch(rails.app_controller) { "#{rspec.spec_dir}/controllers" } + + # Capybara 
features specs + watch(rails.view_dirs) { |m| rspec.spec.call("features/#{m[1]}") } + watch(rails.layouts) { |m| rspec.spec.call("features/#{m[1]}") } + + # Turnip features and steps + watch(%r{^spec/acceptance/(.+)\.feature$}) + watch(%r{^spec/acceptance/steps/(.+)_steps\.rb$}) do |m| + Dir[File.join("**/#{m[1]}.feature")][0] || "spec/acceptance" + end +end diff --git a/README.md b/README.md index eb609bd..17cb21c 100644 --- a/README.md +++ b/README.md @@ -2,3 +2,21 @@ Like leaves on the wind [A data structures, algorithms, file I/O, ruby and regular expression (regex) project from the Viking Code School](http://www.vikingcodeschool.com) + +Worked on by [Roy Chen](https://github.com/roychen25) + +## Getting Started + +To run this program, fork and clone this repository. + +In the cloned directory, run this command: + +``` +ruby example.rb +``` + +The output includes: + +1. Printing information about the root node of the DOM tree created + +2. Rebuilding the DOM tree into its original format diff --git a/example.rb b/example.rb new file mode 100644 index 0000000..cf7171a --- /dev/null +++ b/example.rb @@ -0,0 +1,15 @@ +if $0 == __FILE__ + require_relative './lib/dom_reader' + require_relative './lib/node_renderer' + require_relative './lib/dom_rebuilder' + + dom_reader = DOMReader.new + + tree = dom_reader.build_tree('./test.html') + + node_renderer = NodeRenderer.new(tree) + node_renderer.render + + dom_rebuilder = DOMRebuilder.new(tree) + dom_rebuilder.print_tree +end diff --git a/lib/dom_reader.rb b/lib/dom_reader.rb new file mode 100644 index 0000000..f6aa9b6 --- /dev/null +++ b/lib/dom_reader.rb @@ -0,0 +1,113 @@ +require_relative './dom_tree' + +class DOMReader + def initialize; end + + def build_tree(filename) + # break file into tokens + tokens = tokenize_file(filename) + + # exit if we have no tokens + return nil if tokens.empty? 
+ + # remove doctype declaration, if any + tokens = remove_doctype(tokens) + + # initialize the tree + tree = DOMTree.new + current_node = tree.document + + # set current depth of the tree + current_depth = 0 + + # create tree's root node if it does not exist yet + if tree.document.nil? + token = tokens.shift + tree.document = Node.new(token_type(token), token, current_depth, nil, []) + current_node = tree.document + end + + # process remaining tokens + until tokens.empty? + token = tokens.shift + token_type = token_type(token) + + node = Node.new(token_type, token, current_depth, current_node, []) + current_node.children << node + + case token_type + when :open_tag + current_node = node + current_depth += 1 + when :text + next + when :close_tag + current_node = current_node.parent unless current_node == tree.document + current_depth -= 1 unless current_depth == 0 + when :unknown + # ignore unknown tag for now + next + end + end + + tree + end + + def read_file(filename) + raise "The file to be read does not exist." unless File.exist?(filename) + + File.readlines(filename) + end + + def tokenize_file(filename) + lines = read_file(filename) + tokens = [] + lines.each { |line| tokens << tokenize(line) } + tokens.flatten! + end + + def tokenize(text) + regex = /(<.+?>|[^<>]+|<\/\w+?>)/ + tokens = text.scan(regex).flatten + + tokens + end + + def remove_doctype(tokens) + tokens.delete_at(0) if is_doctype_tag?(tokens[0]) + tokens + end + + def token_type(token) + return :unknown unless token.is_a?(String) + + return :open_tag if is_open_tag?(token) + + return :close_tag if is_close_tag?(token) + + return :text if is_text?(token) + + :unknown + end + + def is_doctype_tag?(text) + regex = //i + !text.match(regex).nil? + end + + def is_open_tag?(text) + # regex = /^<(\w+)>$/ + regex = /^<(\w+)\s*(.+)*>$/ + !text.match(regex).nil? + end + + def is_close_tag?(text) + regex = /^<(\/\w+)>$/ + !text.match(regex).nil? 
+ end + + def is_text?(text) + regex = /^[^<>\/]+$/ + !text.match(regex).nil? + end +end diff --git a/lib/dom_rebuilder.rb b/lib/dom_rebuilder.rb new file mode 100644 index 0000000..076ae5b --- /dev/null +++ b/lib/dom_rebuilder.rb @@ -0,0 +1,30 @@ +class DOMRebuilder + attr_reader :tree + + def initialize(tree = nil) + @tree = tree + end + + # print out the DOM tree using DFS + def print_tree(start_node = nil) + output = "" + + start_node = self.tree.document if start_node.nil? + + stack = [] + stack.push(start_node) + + until stack.empty? + current_node = stack.pop + + output << current_node.content + + # it's important to reverse the child nodes first before + # pushing onto the stack, so that they'll be printed out + # in the right order + current_node.children.reverse.each { |child| stack.push(child) } + end + + puts output + end +end diff --git a/lib/dom_tree.rb b/lib/dom_tree.rb new file mode 100644 index 0000000..4e374cc --- /dev/null +++ b/lib/dom_tree.rb @@ -0,0 +1,50 @@ +Node = Struct.new(:type, :content, :depth, :parent, :children) + +class DOMTree + attr_accessor :document + + def initialize(document = nil) + # root node of tree + @document = document + end + + # methods for traversing tree to be added here + + def num_nodes_below(node = nil) + return 0 if node.nil? || node.children.empty? + + count = 0 + queue = [] + node.children.each { |child| queue << child } + + until queue.empty? + current_node = queue.shift + count += 1 + current_node.children.each { |child| queue << child unless child.nil? } + end + + count + end + + def node_types_below(node = nil) + return {} if node.nil? || node.children.empty? + + node_types = {} + + queue = [] + node.children.each { |child| queue << child } + + until queue.empty? + current_node = queue.shift + current_node.children.each { |child| queue << child unless child.nil? 
} + + if node_types.keys.include?(current_node.type) + node_types[current_node.type] += 1 + else + node_types[current_node.type] = 1 + end + end + + node_types + end +end diff --git a/lib/node_renderer.rb b/lib/node_renderer.rb new file mode 100644 index 0000000..6492c0a --- /dev/null +++ b/lib/node_renderer.rb @@ -0,0 +1,38 @@ +class NodeRenderer + attr_reader :tree + + def initialize(tree = nil) + @tree = tree + end + + def render(node = nil) + display_data_attributes(node) + display_num_nodes_below(node) + display_node_types_below(node) + end + + def display_data_attributes(node = nil) + node = self.tree.document if node.nil? + + puts "=== Node data attributes ===" + puts "Node type: #{node.type}" + puts "Node content: #{node.content.inspect}" + puts "Node depth: #{node.depth}" + puts "Children: #{node.children.length}\n\n" + end + + def display_num_nodes_below(node = nil) + node = self.tree.document if node.nil? + + puts "=== Number of nodes in subtree(s) below ===" + puts "#{self.tree.num_nodes_below(node)}\n\n" + end + + def display_node_types_below(node = nil) + node = self.tree.document if node.nil? + node_types = self.tree.node_types_below(node) + + puts "=== Node types in subtree(s) below ===" + puts node_types + end +end diff --git a/lib/tree_searcher.rb b/lib/tree_searcher.rb new file mode 100644 index 0000000..191af98 --- /dev/null +++ b/lib/tree_searcher.rb @@ -0,0 +1,60 @@ +class TreeSearcher + attr_reader :tree + + def initialize(tree = nil) + @tree = tree + end + + def search_by(attribute, value) + case attribute + when :name + regex = /^<#{value}.*>$/i + node_type = value.start_with?("/") ? :close_tag : :open_tag + when :text + regex = /#{value}/i + node_type = :text + when :id + regex = /id\s*=\s*['"]#{value}['"]/i + node_type = :open_tag + when :class + regex = /class\s*=\s*['"].*?#{value}.*?['"]/i + node_type = :open_tag + else + puts "Invalid attribute to search for." 
+ return nil + end + + find_nodes_by(node_type, regex, value) + end + + def search_descendents(start_node, attribute, value) + nodes = search_by(attribute, value) + + nodes.select { |node| node.depth > start_node.depth } + end + + def search_ancestors(start_node, attribute, value) + nodes = search_by(attribute, value) + + nodes.select { |node| node.depth < start_node.depth } + end + + private + + def find_nodes_by(node_type, regex, value) + # perform BFS on tree to find matching nodes + collection = [] + queue = [] + + queue << self.tree.document + until queue.empty? + current_node = queue.shift + + collection << current_node if current_node.content.match(regex) && current_node.type == node_type + + current_node.children.each { |child| queue << child unless child.nil? } + end + + collection + end +end diff --git a/spec/dom_reader_spec.rb b/spec/dom_reader_spec.rb new file mode 100644 index 0000000..0130947 --- /dev/null +++ b/spec/dom_reader_spec.rb @@ -0,0 +1,166 @@ +# spec/dom_reader_spec.rb + +require 'dom_reader' + +describe "DOMReader" do + let(:dom_reader) { DOMReader.new } + + describe "#initialize" do + it "creates an instance of DOMReader" do + expect(dom_reader).to be_a(DOMReader) + end + end + + describe "#build_tree" do + it "returns a tree of Nodes built from the input filename" do + filename = './test.html' + contents = ["\n", "\n", "\n", "Title\n", "\n", "\n", "

text

\n", "\n", "\n"] + + allow(File).to receive(:readlines).with(filename).and_return(contents) + + tree = dom_reader.build_tree(filename) + expect(tree).to be_a(DOMTree) + expect(tree.document).to be_a(Node) + expect(tree.document.type).to eq(:text) + expect(tree.document.children.first.type).to eq(:open_tag) + expect(tree.document.children.first.content).to eq("") + end + end + + describe "#read_file" do + it "raises an error if the file to be read does not exist" do + expect { dom_reader.read_file("blah") }.to raise_error(/file to be read does not exist/) + end + + it "returns the file's contents as an array" do + filename = './test.html' + contents = ["\n", "\n", "\n", "Title\n", "\n", "\n", "

text

\n", "\n", "\n"] + + allow(File).to receive(:readlines).with(filename).and_return(contents) + + expect(dom_reader.read_file(filename)).to be_a(Array) + expect(dom_reader.read_file(filename)).to eq(contents) + end + end + + describe "#tokenize_file" do + it "returns the file's contents as an array of tokens" do + filename = './test.html' + contents = ["\n", "\n", "\n"] + + allow(File).to receive(:readlines).with(filename).and_return(contents) + + expected_result = ["", "\n", "", "\n", "", "\n"] + + expect(dom_reader.tokenize_file(filename)).to eq(expected_result) + end + end + + describe "#tokenize" do + it "breaks down the input string into tokens" do + test_string = "This document contains data\n" + tokens = dom_reader.tokenize(test_string) + + expect(tokens.length).to eq(1) + expect(tokens[0]).to eq("This document contains data\n") + end + + it "correctly tokenizes mixed content on the same line" do + test_string = "some text" + tokens = dom_reader.tokenize(test_string) + + expect(tokens.length).to eq(3) + expect(tokens[0]).to eq("") + expect(tokens[1]).to eq("some text") + expect(tokens[2]).to eq("") + end + end + + describe "#remove_doctype" do + it "removes any doctype declaration tokens from the provided list of tokens" do + tokens = ["", "\n", "", "\n", "", "\n"] + expected_result = ["\n", "", "\n", "", "\n"] + + expect(dom_reader.remove_doctype(tokens)).to eq(expected_result) + end + end + + describe "#token_type" do + it "returns :open_tag if the input is an opening tag" do + token = "" + expect(dom_reader.token_type(token)).to eq(:open_tag) + + token = "" + expect(dom_reader.token_type(token)).to eq(:open_tag) + end + + it "returns :close_tag if the input is a closing tag" do + token = "" + expect(dom_reader.token_type(token)).to eq(:close_tag) + end + + it "returns :text if the input is just text" do + token = "plaintext" + expect(dom_reader.token_type(token)).to eq(:text) + + token = "This is some mixed text!! 
Blah blah 234 hrgargh" + expect(dom_reader.token_type(token)).to eq(:text) + end + + it "returns :unknown otherwise" do + token = ["an", "array"] + expect(dom_reader.token_type(token)).to eq(:unknown) + + token = "foobar")).to be true + expect(dom_reader.is_open_tag?("")).to be true + expect(dom_reader.is_open_tag?("")).to be true + end + + it "returns false otherwise" do + expect(dom_reader.is_open_tag?("text")).to be false + expect(dom_reader.is_open_tag?("")).to be false + end + end + + describe "#is_close_tag?" do + it "returns true if the argument is a closing tag" do + expect(dom_reader.is_close_tag?("")).to be true + end + + it "returns false otherwise" do + expect(dom_reader.is_close_tag?("")).to be false + expect(dom_reader.is_close_tag?("text")).to be false + end + end + + describe "#is_text?" do + it "returns true if the argument is just text (not a tag)" do + expect(dom_reader.is_text?("text")).to be true + expect(dom_reader.is_text?("multiple words here!!!")).to be true + end + + it "returns false otherwise" do + expect(dom_reader.is_text?("")).to be false + expect(dom_reader.is_text?("")).to be false + end + end +end diff --git a/spec/dom_tree_spec.rb b/spec/dom_tree_spec.rb new file mode 100644 index 0000000..aeb31e1 --- /dev/null +++ b/spec/dom_tree_spec.rb @@ -0,0 +1,98 @@ +# spec/dom_tree_spec.rb + +require 'dom_tree' + +describe "DOMTree" do + let(:domtree) { DOMTree.new } + + describe "#initialize" do + it "creates an instance of DOMTree" do + expect(domtree).to be_a(DOMTree) + end + end + + context "instance variables" do + it "allows the instance variable @document to be read" do + test_domtree = DOMTree.new("document") + expect(test_domtree.document).to eq("document") + end + + it "allows the instance variable @document to be set" do + test_domtree = DOMTree.new("document") + test_domtree.document = "new document" + expect(test_domtree.document).to eq("new document") + end + end + + describe "#num_nodes_below" do + it "returns the number of 
nodes in the sub-tree below the specified node" do + node1 = Node.new(:open_tag, "", 0, nil, []) + node2 = Node.new(:text, "text", 0, node1, []) + node3 = Node.new(:close_tag, "", 0, node1, []) + + node1.children << node2 + node2.children << node3 + + expect(domtree.num_nodes_below(node1)).to eq(2) + expect(domtree.num_nodes_below(node2)).to eq(1) + expect(domtree.num_nodes_below(node3)).to eq(0) + end + end + + describe "#node_types_below" do + it "returns the types and quantities of nodes in the sub-tree below the specified node" do + node1 = Node.new(:open_tag, "", 0, nil, []) + node2 = Node.new(:text, "text", 0, node1, []) + node3 = Node.new(:close_tag, "", 0, node1, []) + + node1.children << node2 + node2.children << node3 + node_types = domtree.node_types_below(node1) + + expect(node_types).to be_a(Hash) + expect(node_types.keys.length).to eq(2) + expect(node_types.keys).to include(:text) + expect(node_types.keys).to include(:close_tag) + expect(node_types[:text]).to eq(1) + expect(node_types[:close_tag]).to eq(1) + end + end + + context "Node struct" do + let(:node) { Node.new(:open_tag, "

", 0, nil, []) } + + context "Node attributes" do + it "has a :type attribute" do + expect(node.type).to eq(:open_tag) + node.type = :close_tag + expect(node.type).to eq(:close_tag) + end + + it "has a :content attribute" do + expect(node.content).to eq("

") + node.content = "

" + expect(node.content).to eq("

") + end + + it "has a :depth attribute" do + expect(node.depth).to eq(0) + node.depth = 1 + expect(node.depth).to eq(1) + end + + it "has a :parent attribute" do + expect(node.parent).to be nil + node.parent = Node.new(:text, "foobar", 0, nil, []) + expect(node.parent.content).to eq("foobar") + end + + it "has a :children attribute" do + expect(node.children).to be_a(Array) + expect(node.children.length).to eq(0) + node.children << "child" + expect(node.children[0]).to eq("child") + expect(node.children.length).to eq(1) + end + end + end +end diff --git a/spec/node_renderer_spec.rb b/spec/node_renderer_spec.rb new file mode 100644 index 0000000..ebf0d70 --- /dev/null +++ b/spec/node_renderer_spec.rb @@ -0,0 +1,46 @@ +# spec/node_renderer_spec.rb + +require 'dom_tree' +require 'node_renderer' + +describe "NodeRenderer" do + describe "#initialize" do + it "creates an instance of NodeRenderer" do + expect(NodeRenderer.new).to be_a(NodeRenderer) + end + end + + context "instance variables" do + describe "#tree" do + it "returns the value of the instance variable @tree" do + test_node_renderer = NodeRenderer.new("tree") + expect(test_node_renderer.tree).to eq("tree") + end + end + end + + describe "#render" do + node1 = Node.new(:open_tag, "", 0, nil, []) + node2 = Node.new(:text, "text", 0, node1, []) + node3 = Node.new(:close_tag, "", 0, node1, []) + tree = DOMTree.new(node1) + node_renderer = NodeRenderer.new(tree) + + before(:each) { allow(node_renderer).to receive(:puts).and_return(nil) } + + it "displays all the input node's data attributes" do + expect(node_renderer).to receive(:display_data_attributes) + node_renderer.render + end + + it "displays the total number of nodes in the sub-tree below the input node" do + expect(node_renderer).to receive(:display_num_nodes_below) + node_renderer.render + end + + it "displays a count of each node type in the sub-tree below the input node" do + expect(node_renderer).to receive(:display_node_types_below) + 
node_renderer.render + end + end +end diff --git a/spec/spec_helper.rb b/spec/spec_helper.rb new file mode 100644 index 0000000..47b39ce --- /dev/null +++ b/spec/spec_helper.rb @@ -0,0 +1,103 @@ +# This file was generated by the `rspec --init` command. Conventionally, all +# specs live under a `spec` directory, which RSpec adds to the `$LOAD_PATH`. +# The generated `.rspec` file contains `--require spec_helper` which will cause +# this file to always be loaded, without a need to explicitly require it in any +# files. +# +# Given that it is always loaded, you are encouraged to keep this file as +# light-weight as possible. Requiring heavyweight dependencies from this file +# will add to the boot time of your test suite on EVERY test run, even for an +# individual file that may not need all of that loaded. Instead, consider making +# a separate helper file that requires the additional dependencies and performs +# the additional setup, and require it from the spec files that actually need +# it. +# +# The `.rspec` file also contains a few flags that are not defaults but that +# users commonly want. +# +# See http://rubydoc.info/gems/rspec-core/RSpec/Core/Configuration +RSpec.configure do |config| + # rspec-expectations config goes here. You can use an alternate + # assertion/expectation library such as wrong or the stdlib/minitest + # assertions if you prefer. + config.expect_with :rspec do |expectations| + # This option will default to `true` in RSpec 4. It makes the `description` + # and `failure_message` of custom matchers include text for helper methods + # defined using `chain`, e.g.: + # be_bigger_than(2).and_smaller_than(4).description + # # => "be bigger than 2 and smaller than 4" + # ...rather than: + # # => "be bigger than 2" + expectations.include_chain_clauses_in_custom_matcher_descriptions = true + end + + # rspec-mocks config goes here. You can use an alternate test double + # library (such as bogus or mocha) by changing the `mock_with` option here. 
+ config.mock_with :rspec do |mocks| + # Prevents you from mocking or stubbing a method that does not exist on + # a real object. This is generally recommended, and will default to + # `true` in RSpec 4. + mocks.verify_partial_doubles = true + end + + # This option will default to `:apply_to_host_groups` in RSpec 4 (and will + # have no way to turn it off -- the option exists only for backwards + # compatibility in RSpec 3). It causes shared context metadata to be + # inherited by the metadata hash of host groups and examples, rather than + # triggering implicit auto-inclusion in groups with matching metadata. + config.shared_context_metadata_behavior = :apply_to_host_groups + +# The settings below are suggested to provide a good initial experience +# with RSpec, but feel free to customize to your heart's content. +=begin + # This allows you to limit a spec run to individual examples or groups + # you care about by tagging them with `:focus` metadata. When nothing + # is tagged with `:focus`, all examples get run. RSpec also provides + # aliases for `it`, `describe`, and `context` that include `:focus` + # metadata: `fit`, `fdescribe` and `fcontext`, respectively. + config.filter_run_when_matching :focus + + # Allows RSpec to persist some state between runs in order to support + # the `--only-failures` and `--next-failure` CLI options. We recommend + # you configure your source control system to ignore this file. + config.example_status_persistence_file_path = "spec/examples.txt" + + # Limits the available syntax to the non-monkey patched syntax that is + # recommended. For more details, see: + # - http://rspec.info/blog/2012/06/rspecs-new-expectation-syntax/ + # - http://www.teaisaweso.me/blog/2013/05/27/rspecs-new-message-expectation-syntax/ + # - http://rspec.info/blog/2014/05/notable-changes-in-rspec-3/#zero-monkey-patching-mode + config.disable_monkey_patching! + + # This setting enables warnings. 
It's recommended, but in some cases may + # be too noisy due to issues in dependencies. + config.warnings = true + + # Many RSpec users commonly either run the entire suite or an individual + # file, and it's useful to allow more verbose output when running an + # individual spec file. + if config.files_to_run.one? + # Use the documentation formatter for detailed output, + # unless a formatter has already been configured + # (e.g. via a command-line flag). + config.default_formatter = 'doc' + end + + # Print the 10 slowest examples and example groups at the + # end of the spec run, to help surface which specs are running + # particularly slow. + config.profile_examples = 10 + + # Run specs in random order to surface order dependencies. If you find an + # order dependency and want to debug it, you can fix the order by providing + # the seed, which is printed after each run. + # --seed 1234 + config.order = :random + + # Seed global randomization in this process using the `--seed` CLI option. + # Setting this allows you to use `--seed` to deterministically reproduce + # test failures related to randomization by passing the same `--seed` value + # as the one that triggered the failure. + Kernel.srand config.seed +=end +end diff --git a/spec/tree_searcher_spec.rb b/spec/tree_searcher_spec.rb new file mode 100644 index 0000000..b63af64 --- /dev/null +++ b/spec/tree_searcher_spec.rb @@ -0,0 +1,141 @@ +# spec/tree_searcher_spec.rb + +require 'dom_tree' +require 'tree_searcher' + +describe "TreeSearcher" do + describe "#initialize" do + it "creates an instance of TreeSearcher" do + expect(TreeSearcher.new).to be_a(TreeSearcher) + end + end + + context "instance variables" do + describe "#tree" do + it "returns the value of the instance variable @tree" do + test_tree_searcher = TreeSearcher.new("tree") + expect(test_tree_searcher.tree).to eq("tree") + end + end + end + + context "searching for nodes" do + # setup nodes for integration test + node1 = Node.new(:open_tag, "
", 0, nil, []) + node2 = Node.new(:text, "Some text right here!", 0, node1, []) + node3 = Node.new(:close_tag, "
", 0, node1, []) + + node4 = Node.new(:open_tag, "

", 1, node1, []) + node5 = Node.new(:text, "Some p text right here!", 1, node4, []) + node6 = Node.new(:close_tag, "

", 1, node4, []) + + node1.children << node2 + node1.children << node4 + node1.children << node3 + + node4.children << node5 + node4.children << node6 + + # setup tree and TreeSearcher instance + tree = DOMTree.new(node1) + tree_searcher = TreeSearcher.new(tree) + + describe "#search_by" do + context "searching for nodes by their tag name" do + it "returns the correct nodes when searching for an opening tag" do + results = tree_searcher.search_by(:name, 'div') + + expect(results).to be_a(Array) + expect(results.length).to eq(1) + expect(results.first).to eq(node1) + end + + it "returns the correct nodes when searching for a closing tag" do + results = tree_searcher.search_by(:name, '/div') + + expect(results).to be_a(Array) + expect(results.length).to eq(1) + expect(results.first).to eq(node3) + end + end + + context "searching for nodes by text" do + it "returns the correct nodes when searching by text" do + results = tree_searcher.search_by(:text, "text right here") + + expect(results).to be_a(Array) + expect(results.length).to eq(2) + expect(results).to include(node2) + expect(results).to include(node5) + + results = tree_searcher.search_by(:text, "P TEXT") + + expect(results).to be_a(Array) + expect(results.length).to eq(1) + expect(results).to include(node5) + end + end + + context "searching for nodes by id" do + it "returns the correct nodes when searching by id" do + results = tree_searcher.search_by(:id, "bar") + + expect(results).to be_a(Array) + expect(results.length).to eq(1) + expect(results.first).to eq(node1) + end + end + + context "searching for nodes by class" do + it "returns the correct nodes when searching by class" do + results = tree_searcher.search_by(:class, "foo") + + expect(results).to be_a(Array) + expect(results.length).to eq(2) + expect(results).to include(node1) + expect(results).to include(node4) + + results = tree_searcher.search_by(:class, "baz") + + expect(results).to be_a(Array) + expect(results.length).to eq(1) + 
expect(results.first).to eq(node1) + end + end + end + + describe "#search_descendents" do + it "returns the same results as #search_by, but only includes the descendents of the input node" do + root = tree_searcher.tree.document + results = tree_searcher.search_descendents(root, :class, "foo") + + expect(results).to be_a(Array) + expect(results.length).to eq(1) + expect(results.first).to eq(node4) + + results = tree_searcher.search_descendents(root, :text, "right here") + + expect(results).to be_a(Array) + expect(results.length).to eq(1) + expect(results.first).to eq(node5) + end + end + + describe "search_ancestors" do + it "returns the same results as #search_by, but only includes the ancestors of the input node" do + start_node = node6 + results = tree_searcher.search_ancestors(start_node, :class, "foo") + + expect(results).to be_a(Array) + expect(results.length).to eq(1) + expect(results.first).to eq(node1) + + results = tree_searcher.search_ancestors(start_node, :text, "right here") + + expect(results).to be_a(Array) + expect(results.length).to eq(1) + expect(results.first).to eq(node2) + end + end + end +end diff --git a/warmups/design_pseudocode.md b/warmups/design_pseudocode.md new file mode 100644 index 0000000..8982e00 --- /dev/null +++ b/warmups/design_pseudocode.md @@ -0,0 +1,51 @@ +### Pseudocoding the design + +``` +# DOMReader +Create an instance of DOMReader +Read in the provided HTML file +Parse it into individual tokens using regular expressions +Process each token to ensure that each edge case is handled: + - a tag with text before it, e.g. text, or text + - a tag with text after it, e.g. 
# Minimal HTML parser used as a warm-up exercise.
#
# It splits an HTML string into tokens (tags and text) and builds a tree of
# Node structs from them. Only bare tags such as <div> and </div> are
# recognised by the tag predicates; anything else in angle brackets is
# tokenized but ignored when the tree is built.
class HTMLParser
  # type       - :tag or :text
  # content    - the raw token ("<div>" or the text itself)
  # attributes - always an empty Hash here (attribute parsing is not done)
  # parent     - the enclosing Node, or nil for the root
  # children   - Array of child Nodes, in document order
  Node = Struct.new(:type, :content, :attributes, :parent, :children)

  # One token is either a whole <...> group or a run of characters that
  # contains no angle bracket.
  TOKEN_REGEX = /<[^<>]+>|[^<>]+/
  # Text glued to the front of a tag (group 1) or to the back of it (group 2).
  MIXED_TOKEN_REGEX = /(.+)<\/?\w+>|<\/?\w+>(.+)/
  OPEN_TAG_REGEX = /^<(\w+)>$/
  CLOSE_TAG_REGEX = /^<(\/\w+)>$/
  TEXT_REGEX = /^[^<>\/]+$/

  attr_reader :html_string,
              :tokens,
              :root

  # html_string - the markup to parse. Tokenizes it and builds the tree
  # immediately; results are exposed through #tokens and #root.
  def initialize(html_string)
    @html_string = html_string
    @tokens = tokenize(html_string)
    @root = nil

    # build_tree consumes the array it is given, so hand it a copy and keep
    # @tokens intact.
    build_tree(tokens.dup)
  end

  # Prints the subtree rooted at start_node to stdout: text nodes inline,
  # tag nodes on their own lines followed by the matching closing tag.
  # A nil node (or a node of any other type) prints nothing.
  def print_tree(start_node)
    return if start_node.nil?

    case start_node.type
    when :text
      print "#{start_node.content} "
    when :tag
      puts "\n#{start_node.content}"
      start_node.children.each { |child_node| print_tree(child_node) }
      # "<div>" becomes "</div>": keep the "<", insert "/", append the rest
      print "\n#{start_node.content[0]}/#{start_node.content[1..-1]}\n"
    end
  end

  # Builds the Node tree from tokens, starting from the current root.
  #
  # tokens - Array of token Strings; it is emptied as tokens are consumed.
  #
  # Text and closing tags that appear outside any element (before the first
  # opening tag, or after the root has been closed) are skipped, because
  # there is no node to attach them to. An opening tag seen while no element
  # is open becomes the root, replacing any previous root.
  def build_tree(tokens)
    current_node = root

    until tokens.empty?
      token = tokens.shift

      if is_text?(token)
        current_node.children << Node.new(:text, token, {}, current_node, []) if current_node
      elsif is_open_tag?(token)
        new_node = Node.new(:tag, token, {}, current_node, [])
        if current_node.nil?
          @root = new_node
        else
          current_node.children << new_node
        end
        # descend: later tokens belong to the tag that was just opened
        current_node = new_node
      elsif is_close_tag?(token)
        # climb back to the parent; nothing to do if no element is open
        current_node = current_node.parent if current_node
      end
    end
  end

  # True when token is a bare opening tag such as "<div>".
  def is_open_tag?(token)
    !token.match(OPEN_TAG_REGEX).nil?
  end

  # True when token is a bare closing tag such as "</div>".
  def is_close_tag?(token)
    !token.match(CLOSE_TAG_REGEX).nil?
  end

  # True when token contains no "<", ">" or "/" characters.
  def is_text?(token)
    !token.match(TEXT_REGEX).nil?
  end

  # Splits html_string into tag and text tokens, stores them in @tokens and
  # returns them. Surrounding whitespace is stripped from every token and
  # tokens that end up empty are dropped, so indentation between tags does
  # not produce text tokens.
  def tokenize(html_string)
    pieces = html_string.scan(TOKEN_REGEX).flat_map do |piece|
      # safety net: split any piece that still mixes a tag with text
      process_token(piece) || [piece]
    end

    @tokens = pieces.map(&:strip).reject(&:empty?)
  end

  # Splits a token that combines text and a tag.
  #
  # Returns [text, tag] when the text comes first ("text</p>"),
  # [tag, text] when the tag comes first ("<p>text"), and nil when the
  # token does not mix the two.
  def process_token(token)
    match = MIXED_TOKEN_REGEX.match(token)
    return if match.nil?

    group = match[1] ? 1 : 2
    text = match[group]
    # Cut the text out at the position where it matched; removing it by
    # value could hit the same characters inside the tag name ("<b>b").
    tag = token[0...match.begin(group)] + token[match.end(group)..-1]

    group == 1 ? [text, tag] : [tag, text]
  end
end

if __FILE__ == $0
  html_string = "<div>div text before<p>p text</p><div>more div text</div>div text after</div>"

  html_parser = HTMLParser.new(html_string)

  p html_parser.tokens
  p html_parser.root

  html_parser.print_tree(html_parser.root)
end
# A Struct (fixed fields) or an OpenStruct (built from a hash) could be used
# instead of a plain Hash, so that the tag's attributes can be read with dot
# notation:
#
#   require 'ostruct'
#   tag = OpenStruct.new(hash)
#   tag.new_attr = new_value

# Parses a single opening tag into a Hash of its type and attributes.
class TagParser
  # Captures the tag name that directly follows the opening "<".
  TYPE_REGEX = /^<(\w+)/
  # name=<quote>value<same quote>. The backreference makes the closing quote
  # match the opening one, so a value may contain the other quote character
  # and may be empty. Hyphenated names such as data-id are kept whole.
  ATTRIBUTE_REGEX = /([\w\-]+)=(['"])(.*?)\2/

  def initialize; end

  # Parses an opening tag.
  #
  # tag - String such as "<p class='foo bar' id='baz'>".
  #
  # Returns a Hash with symbol keys. The tag type is stored under a key
  # named after the type itself (p: "p"). Each attribute value is split on
  # whitespace: one word is stored as a String, anything else as an Array
  # (an empty value becomes []).
  #
  # Raises ArgumentError when tag does not start with "<" followed by a
  # tag name.
  def parse_tag(tag)
    tag_type = find_tag_type(tag)

    # NOTE(review): the type is keyed by its own name rather than a fixed
    # key such as :type - confirm against callers before changing.
    parsed_tag = { tag_type.to_sym => tag_type }

    find_tag_attributes(tag).each do |attr, raw_value|
      words = raw_value.split
      parsed_tag[attr.to_sym] = words.length == 1 ? words.first : words
    end

    parsed_tag
  end

  private

  # Returns the tag name, e.g. "p" for "<p class='x'>".
  def find_tag_type(tag)
    type_match = tag.match(TYPE_REGEX)
    raise ArgumentError, "not an opening tag: #{tag.inspect}" if type_match.nil?

    type_match.captures.first
  end

  # Returns an Array of [name, value] pairs, one per attribute found.
  def find_tag_attributes(tag)
    tag.scan(ATTRIBUTE_REGEX).map { |name, _quote, value| [name, value] }
  end
end

if __FILE__ == $0
  parser = TagParser.new
  tag = parser.parse_tag("<p class='foo bar' id='baz' name='fozzie'>")
  p tag
end