# Combined listing of lib/dom_tree.rb, lib/node.rb, lib/node_renderer.rb and
# lib/tree_searcher.rb. When these live in separate files again,
# lib/tree_searcher.rb must require node, dom_tree and node_renderer.

# ---- lib/dom_tree.rb ----

# Parses an HTML string into a tree of Node structs rooted at a synthetic
# "document head" node, and can write that tree back out as indented text.
class DomTree
  # Elements that never have content or a closing tag; they are stored as
  # leaves so that the nodes following them are not nested underneath.
  VOID_ELEMENTS = %w[area base br col embed hr img input link meta param source track wbr].freeze

  # `string` is never assigned by this class; the reader is kept only so
  # existing callers do not break.
  attr_reader :string, :document

  def initialize
    @document = Node.new("document head", nil, 0, [])
  end

  # Splits +str+ into tags and text runs.
  #
  # @param str [String] raw HTML
  # @return [Array<String>] stripped, non-empty tokens in document order
  def parse_string(str)
    # `[^>]*` lets a tag span several lines and `[^<]+` keeps every text
    # character (symbols such as "$" or "+" included).
    str.scan(/<[^>]*>|[^<]+/).map(&:strip).reject(&:empty?)
  end

  # Tokenizes +str+ and hangs every token under the document node.
  #
  # @param str [String] raw HTML
  # @return [Array<String>] the token list that was processed
  def build_tree(str)
    parsed_html = parse_string(str)
    top = @document
    parsed_html.each do |item|
      top = add_node(build_node(item), top)
    end
  end

  # Attaches +node+ relative to the currently open node +top+.
  #
  # @return [Node] the node that is open after this token
  def add_node(node, top)
    if container_tag?(node.type)
      add_opening_tag(node, top)
    else
      add_other_tag(node, top)
    end
  end

  # Adds +node+ as a child of +top+ and makes it the new open node.
  def add_opening_tag(node, top)
    top.children << node
    node.children = []
    node.parent = top
    node.depth = top.depth + 1
    node
  end

  # Handles closing tags (step back to the parent) and leaves such as text,
  # void elements and declarations (append to +top+, which stays open).
  def add_other_tag(node, top)
    # A stray closing tag at document level has no parent to return to;
    # stay on the current node instead of handing nil to the next token.
    return top.parent || top if closing_tag?(node.type)

    top.children << node
    node.parent = top
    node.depth = top.depth + 1
    top
  end

  def opening_tag?(item)
    item[0] == "<" && item[1] != "/"
  end

  def closing_tag?(item)
    item[0] == "<" && item[1] == "/"
  end

  # True for `<!DOCTYPE ...>`, `<!-- ... -->` and `<?...?>` tokens.
  def declaration?(item)
    item.start_with?("<!", "<?")
  end

  # True when +item+ opens an element that will later be closed by a
  # matching closing tag.
  def container_tag?(item)
    return false unless opening_tag?(item)
    return false if declaration?(item) || item.end_with?("/>")

    name = item[/\A<\s*([^\s>\/]+)/, 1]
    !VOID_ELEMENTS.include?(name.to_s.downcase)
  end

  # Wraps a token in a Node; element tags also get their attributes parsed.
  def build_node(type)
    node = Node.new(type)
    node.build_attributes_hash if opening_tag?(node.type) && !declaration?(node.type)
    node
  end

  # Writes the rendered tree to output.html in the working directory.
  def print_to_file
    # Block form closes the file even if rendering raises.
    File.open('output.html', 'w') { |file| render(@document, file) }
  end

  # Appends +top+ and its subtree to +file+ (anything responding to `<<`),
  # one node per line, indented by depth.
  def render(top, file)
    indent = " " * top.depth
    file << "#{indent}#{top.type}\n"
    top.children.each { |element| render(element, file) } if top.children
    file << "#{indent}#{make_closing(top.type)}>\n" if container_tag?(top.type)
  end

  # @return [String] the closing tag for +tag+ without its final ">",
  #   e.g. "</div" for "<div class='x'>"
  def make_closing(tag)
    "</#{tag[/<([^\s>\/]+)/, 1]}"
  end
end

# ---- lib/node.rb ----

Node = Struct.new(:type, :parent, :depth, :children, :attributes) do
  # Parses the attributes out of the tag text held in `type` and stores them
  # in `attributes`.
  #
  # Valued attributes map to an array of their whitespace-separated words
  # (`class="a b"` => ["a", "b"]); attributes without a value map to true.
  #
  # @return [Hash{String => Array<String>, true}]
  def build_attributes_hash
    tag = type.to_s
    # name = "double" | 'single' | bare
    pair = /([^\s"'=<>\/]+)\s*=\s*(?:"([^"]*)"|'([^']*)'|([^\s"'>]+))/
    att_hash = {}

    tag.scan(pair) do |name, double, single, bare|
      att_hash[name] = (double || single || bare).split
    end

    # Whatever remains once the pairs, the tag name and the closing
    # bracket are removed are value-less ("solo") attributes.
    leftovers = tag.gsub(pair, " ").sub(/\A<\s*[^\s>\/]+/, "").sub(/\/?>\s*\z/, "")
    leftovers.split.each { |name| att_hash[name] = true }

    self.attributes = att_hash
  end
end

# ---- lib/node_renderer.rb ----

# Prints summary statistics for a node and its subtree.
class NodeRenderer
  # @param tree [Node] fallback node used when #render gets nil
  def initialize(tree)
    @tree = tree
  end

  # Prints the subtree size, the per-type counts and the attributes of
  # +node+ (or of the whole tree when +node+ is nil).
  def render(node = nil)
    node ||= @tree
    nodes_below(node)
    node_type_count(node)
    node_attributes(node)
  end

  # Prints how many nodes sit below +node+.
  #
  # @return [String] the printed message
  def nodes_below(node)
    stack = [node]
    count = 0
    while (item = stack.pop)
      count += item.children.length if item.children
      stack.concat(add_children_to_stack(item))
    end
    message = "There are #{count} children in this nodes subtree"
    # puts, like #print_hash, so the line is not wrapped in quotes.
    puts message
    message
  end

  # Prints one line per element type (and "text") found below +node+.
  #
  # @return [Hash{String => Integer}] the tallies that were printed
  def node_type_count(node)
    stack = [node]
    type_hash = Hash.new(0)
    while (item = stack.pop)
      type_hash = update_hash(get_type(item.type), type_hash, item, node)
      stack.concat(add_children_to_stack(item))
    end
    print_hash(type_hash)
  end

  # @return [Array<Node>] a fresh array holding the children of +item+
  def add_children_to_stack(item)
    Array(item.children).dup
  end

  def print_hash(hash)
    hash.each do |key, val|
      puts "There are #{val} #{key}(s) in this nodes subtree"
    end
  end

  # Tallies +item+ under its tag name, or under "text" when it has none.
  def update_hash(match, hash, item, node)
    hash[match || "text"] += 1 if blank_or_star_node?(match, item, node)
    hash
  end

  # True when +item+ should be counted: it has a usable type and is not the
  # node the count started from.
  def blank_or_star_node?(match, item, node)
    # Identity check: Struct#== compares members recursively and would
    # treat a value-equal descendant as the start node.
    match != "" && !item.equal?(node)
  end

  # @return [String, nil] lower-cased tag name; "" for closing tags and
  #   declarations; nil for plain text
  def get_type(tag)
    match = tag.match(/<([a-z]*\d*)\W/i)
    match && match[1].downcase
  end

  def node_attributes(node)
    p node.attributes
  end
end

# ---- lib/tree_searcher.rb ----

# Finds nodes in a tree by attribute value or by exact text.
class TreeSearcher
  # @param tree [Node] root of the tree to search
  def initialize(tree)
    @tree = tree
  end

  # @param attribute [Symbol, String] attribute name, e.g. :class
  # @param text [String] value to look for; nodes that do not carry the
  #   attribute match when their whole type equals +text+
  # @return [Array<Node>] matches in stack-traversal order
  def search_by(attribute, text)
    stack = [@tree]
    matching_nodes = []
    while (node = stack.pop)
      matching_nodes.concat(match_attributes(node, attribute, text))
      stack.concat(add_children_to_stack(node))
    end
    matching_nodes
  end

  # @return [Array<Node>] a fresh array holding the children of +node+
  def add_children_to_stack(node)
    Array(node.children).dup
  end

  # @return [Array<Node>] [node] when it matches, [] otherwise
  def match_attributes(node, attribute, text)
    values = get_attribute(node.type, attribute.to_s)
    matched = values ? values.include?(text) : text == node.type
    matched ? [node] : []
  end

  # Extracts the value(s) of +attribute+ from the tag text +str+.
  #
  # @return [Array<String>, nil] class names for "class", a one-element
  #   array for any other attribute, nil when absent or +str+ is not a tag
  def get_attribute(str, attribute)
    return unless str.start_with?("<")

    # Escape the name so it cannot inject regex syntax, and refuse to match
    # in the middle of a longer name (searching "id" must not hit "data-id").
    name = Regexp.escape(attribute.to_s)
    match = str.match(/(?<![\w-])#{name}\s*=\s*(?:"([^"]*)"|'([^']*)')/)
    return unless match

    value = match[1] || match[2]
    attribute.to_s == "class" ? value.split : [value]
  end
end

# Demo run. Guarded so that merely requiring this code (as the specs do)
# does not read test.html or overwrite output.html.
if __FILE__ == $PROGRAM_NAME
  if File.exist?("test.html")
    dom = DomTree.new
    dom.build_tree(File.read("test.html"))
    searcher = TreeSearcher.new(dom.document)
    node = searcher.search_by(:class, "top-div")[0]
    NodeRenderer.new(dom.document).render(node)
    dom.print_to_file
  else
    warn "test.html not found; skipping demo run"
  end
end
+ I'm an outer div!!! +
+ I'm an inner div!!! I might just + + emphasize + + some text. +
+ I am EVEN MORE TEXT for the SAME div!!! +
+
+
+

+ Welcome to the test doc! +

+

+ This document contains data +

+
+ +
+ + + diff --git a/spec/dom_tree_spec.rb b/spec/dom_tree_spec.rb new file mode 100755 index 0000000..0d2e751 --- /dev/null +++ b/spec/dom_tree_spec.rb @@ -0,0 +1,60 @@ + +require 'node' +require 'dom_tree' + +describe DomTree do + + let(:dom_tree) { DomTree.new } + let(:dom_tree_error) { DomTree.new("arguments")} + let (:test_html) { "
+ div text before +

+ p text +

+
+ more div text +
+ div text after +
" + } + + describe '#initialize' do + it 'returns an instance of DomTree' do + expect(dom_tree).to be_an_instance_of(DomTree) + end + + it 'will return an error if initalized with an argument' do + expect {dom_tree_error}.to raise_error(ArgumentError) + end + + it 'creates a document node with the type set to document head' do + expect(dom_tree.document.type).to eq("document head") + end + end + + + describe '#parse_string' do + + + it 'takes a string as an argument' do + expect { dom_tree.parse_string("test") }.to_not raise_error + end + + it 'removes html formatting and retuns an array of strings for each element' do + expect(dom_tree.parse_string(test_html)).to eq(["
", "div text before", "

", "p text", "

", "
", "more div text", "
", "div text after", "
"]) + end + + end + + describe '#build_tree' do + it 'takes a string as an argument' do + expect { dom_tree.build_tree("test") }.to_not raise_error + end + + it 'after building a tree the head node has correct number of children' do + dom_tree.build_tree(test_html) + expect(dom_tree.document.children.length).to eq(1) + end + end + +end \ No newline at end of file diff --git a/spec/node_renderer_spec.rb b/spec/node_renderer_spec.rb new file mode 100755 index 0000000..dd6db95 --- /dev/null +++ b/spec/node_renderer_spec.rb @@ -0,0 +1,19 @@ + +require 'node_renderer' + +describe NodeRenderer do + let(:node_render) { NodeRenderer.new("tree")} + + + describe '#intialize' do + it 'returns an instance of NodeRenderer' do + expect(node_render).to be_an_instance_of(NodeRenderer) + end + + it 'takes one argument' do + expect{node_render}.to_not raise_error + end + + end + +end \ No newline at end of file diff --git a/spec/spec_helper.rb b/spec/spec_helper.rb new file mode 100755 index 0000000..ef48f3e --- /dev/null +++ b/spec/spec_helper.rb @@ -0,0 +1,97 @@ + +# This file was generated by the `rspec --init` command. Conventionally, all +# specs live under a `spec` directory, which RSpec adds to the `$LOAD_PATH`. +# The generated `.rspec` file contains `--require spec_helper` which will cause +# this file to always be loaded, without a need to explicitly require it in any +# files. +# +# Given that it is always loaded, you are encouraged to keep this file as +# light-weight as possible. Requiring heavyweight dependencies from this file +# will add to the boot time of your test suite on EVERY test run, even for an +# individual file that may not need all of that loaded. Instead, consider making +# a separate helper file that requires the additional dependencies and performs +# the additional setup, and require it from the spec files that actually need +# it. +# +# The `.rspec` file also contains a few flags that are not defaults but that +# users commonly want. 
+# +# See http://rubydoc.info/gems/rspec-core/RSpec/Core/Configuration +RSpec.configure do |config| + # rspec-expectations config goes here. You can use an alternate + # assertion/expectation library such as wrong or the stdlib/minitest + # assertions if you prefer. + config.expect_with :rspec do |expectations| + # This option will default to `true` in RSpec 4. It makes the `description` + # and `failure_message` of custom matchers include text for helper methods + # defined using `chain`, e.g.: + # be_bigger_than(2).and_smaller_than(4).description + # # => "be bigger than 2 and smaller than 4" + # ...rather than: + # # => "be bigger than 2" + expectations.include_chain_clauses_in_custom_matcher_descriptions = true + end + + # rspec-mocks config goes here. You can use an alternate test double + # library (such as bogus or mocha) by changing the `mock_with` option here. + config.mock_with :rspec do |mocks| + # Prevents you from mocking or stubbing a method that does not exist on + # a real object. This is generally recommended, and will default to + # `true` in RSpec 4. + mocks.verify_partial_doubles = true + end + +# The settings below are suggested to provide a good initial experience +# with RSpec, but feel free to customize to your heart's content. +=begin + # These two settings work together to allow you to limit a spec run + # to individual examples or groups you care about by tagging them with + # `:focus` metadata. When nothing is tagged with `:focus`, all examples + # get run. + config.filter_run :focus + config.run_all_when_everything_filtered = true + + # Allows RSpec to persist some state between runs in order to support + # the `--only-failures` and `--next-failure` CLI options. We recommend + # you configure your source control system to ignore this file. + config.example_status_persistence_file_path = "spec/examples.txt" + + # Limits the available syntax to the non-monkey patched syntax that is + # recommended. 
For more details, see: + # - http://rspec.info/blog/2012/06/rspecs-new-expectation-syntax/ + # - http://www.teaisaweso.me/blog/2013/05/27/rspecs-new-message-expectation-syntax/ + # - http://rspec.info/blog/2014/05/notable-changes-in-rspec-3/#zero-monkey-patching-mode + config.disable_monkey_patching! + + # This setting enables warnings. It's recommended, but in some cases may + # be too noisy due to issues in dependencies. + config.warnings = true + + # Many RSpec users commonly either run the entire suite or an individual + # file, and it's useful to allow more verbose output when running an + # individual spec file. + if config.files_to_run.one? + # Use the documentation formatter for detailed output, + # unless a formatter has already been configured + # (e.g. via a command-line flag). + config.default_formatter = 'doc' + end + + # Print the 10 slowest examples and example groups at the + # end of the spec run, to help surface which specs are running + # particularly slow. + config.profile_examples = 10 + + # Run specs in random order to surface order dependencies. If you find an + # order dependency and want to debug it, you can fix the order by providing + # the seed, which is printed after each run. + # --seed 1234 + config.order = :random + + # Seed global randomization in this process using the `--seed` CLI option. + # Setting this allows you to use `--seed` to deterministically reproduce + # test failures related to randomization by passing the same `--seed` value + # as the one that triggered the failure. + Kernel.srand config.seed +=end +end diff --git a/spec/tree_searcher_spec.rb b/spec/tree_searcher_spec.rb new file mode 100755 index 0000000..4832dba --- /dev/null +++ b/spec/tree_searcher_spec.rb @@ -0,0 +1,40 @@ +require 'tree_searcher' + +describe TreeSearcher do + let(:tree_search) {TreeSearcher.new("tree")} + let(:dom_tree) { DomTree.new } + let (:test_html) { "
+ div text before +

+ p text +

+
+ more div text +
+ div text after +
" + } + + + + describe '#initialize' do + it 'returns an instance of TreeSearcher' do + expect(tree_search).to be_an_instance_of(TreeSearcher) + end + + end + + describe '#search_by' do + it 'returns all matching nodes' do + dom_tree.build_tree(test_html) + search = TreeSearcher.new(dom_tree.document) + expect(search.search_by(:class, 'test')[0]).to be_a(Node) + end + + + + end + + + +end \ No newline at end of file diff --git a/spec/warmups_spec.rb b/spec/warmups_spec.rb new file mode 100755 index 0000000..335cafc --- /dev/null +++ b/spec/warmups_spec.rb @@ -0,0 +1 @@ +require 'spec_helper' \ No newline at end of file diff --git a/test.html b/test.html index 6bc7dfd..2ad84a0 100644 --- a/test.html +++ b/test.html @@ -6,7 +6,7 @@ -
+
I'm an outer div!!!
I'm an inner div!!! I might just emphasize some text. @@ -26,7 +26,7 @@

Here is the data:
  • Four list items
  • One unordered list
  • -
  • One h1
  • +
  • One h1
  • One h2
  • One header
  • One main
  • diff --git a/warmups.rb b/warmups.rb new file mode 100644 index 0000000..146272f --- /dev/null +++ b/warmups.rb @@ -0,0 +1,56 @@ + +class DomTree + attr_reader :tag + + def initialize(tag) + @tag = tag + end + + def type + if match = tag.match(/<([a-z]*\d*)\W/) + match.captures[0] + end + end + + def classes + @tag.match(/class\s?=\s?'(.*?)'/).captures[0].split if tag.match(/class\s?=\s?'(.*?)'/) + end + + def id + @tag.match(/id\s?=\s?'(.*?)'/).captures[0] if tag.match(/id\s?=\s?'(.*?)'/) + end + + def name + @tag.match(/name\s?=\s?'(.*?)'/).captures[0] if tag.match(/name\s?=\s?'(.*?)'/) + end + + def title + @tag.match(/title\s?=\s?'(.*?)'/).captures[0] if tag.match(/title\s?=\s?'(.*?)'/) + end + + def src + @tag.match(/src\s?=\s?'(.*?)'/).captures[0] if tag.match(/src\s?=\s?'(.*?)'/) + end + +end + +par = DomTree.new("

    ") +d = DomTree.new("

    ") +i = DomTree.new("") + +p par.type +p par.classes +p par.id +p par.name + +p d.type +p d.classes +p d.id +p d.name + +p i.type +p i.classes +p i.id +p i.name +p i.src +p i.title