From cbbdd62985182071963d2479598f2e257e5545ca Mon Sep 17 00:00:00 2001 From: Roy Chen Date: Thu, 13 Apr 2017 10:00:46 +0800 Subject: [PATCH 001/104] added name to README --- README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README.md b/README.md index eb609bd..03431dc 100644 --- a/README.md +++ b/README.md @@ -2,3 +2,5 @@ Like leaves on the wind [A data structures, algorithms, file I/O, ruby and regular expression (regex) project from the Viking Code School](http://www.vikingcodeschool.com) + +Worked on by [Roy Chen](https://github.com/roychen25) From 74aaa55f3670abfcb7477d4789764ffe052bee46 Mon Sep 17 00:00:00 2001 From: Roy Chen Date: Fri, 14 Apr 2017 07:13:36 +0800 Subject: [PATCH 002/104] finished warmup 1 --- warmup1.rb | 53 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 53 insertions(+) create mode 100644 warmup1.rb diff --git a/warmup1.rb b/warmup1.rb new file mode 100644 index 0000000..f67af3a --- /dev/null +++ b/warmup1.rb @@ -0,0 +1,53 @@ +# it may be possible to use an OpenStruct instead, so that we can use dot +# notation to access the tag's attributes +# +# require 'ostruct' +# Tag = OpenStruct.new(:type, :class, :id, :name) +# tag = Tag.new +# tag.new_attr = new_value +# +# or we can create the OpenStruct by passing in a hash +# tag = OpenStruct.new(hash) + +class TagParser + def initialize; end + + # parse the tag, initialize a hash to store the tag's type/attributes + def parse_tag(tag) + tag_type = find_tag_type(tag) + tag_attributes = find_tag_attributes(tag) + + parsed_tag = {} + parsed_tag[tag_type.to_sym] = tag_type + + tag_attributes.each do |tag_attribute| + attr = tag_attribute[0] + value = tag_attribute[1].split + value = value.first if value.length == 1 + parsed_tag[attr.to_sym] = value + end + + parsed_tag + end + + private + + # find the tag type + def find_tag_type(tag) + type_regex = /^<(\w+)/ + type_match = tag.match(type_regex) + type_match.captures.first + end + + # find all the tag's attributes + def find_tag_attributes(tag) + attribute_regex = /(\w+)=['"](.+?)['"]/ + attribute_matches = tag.scan(attribute_regex) + end +end + +if __FILE__ == $0 + parser = TagParser.new + tag = parser.parse_tag("

") + p tag +end From f96ac37dce069d6bd2bbfc4314fdd086ca30f2b8 Mon Sep 17 00:00:00 2001 From: Roy Chen Date: Fri, 14 Apr 2017 07:16:24 +0800 Subject: [PATCH 003/104] renamed file --- warmup1.rb => tag_parser.rb | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename warmup1.rb => tag_parser.rb (100%) diff --git a/warmup1.rb b/tag_parser.rb similarity index 100% rename from warmup1.rb rename to tag_parser.rb From c7b1d4ea77df65707b56b323811c9a5c4f9ab765 Mon Sep 17 00:00:00 2001 From: Roy Chen Date: Fri, 14 Apr 2017 19:45:39 +0800 Subject: [PATCH 004/104] initial commit for Warmup 2 --- html_parser.rb | 44 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 44 insertions(+) create mode 100644 html_parser.rb diff --git a/html_parser.rb b/html_parser.rb new file mode 100644 index 0000000..098dbd5 --- /dev/null +++ b/html_parser.rb @@ -0,0 +1,44 @@ +class HtmlParser + def initialize; end + + def parse(html_string) + + end + + def find_start_tag(html_string) + regex = /^(<\w+>)/ + match = html_string.match(regex) + match.captures[0] + end + + def find_end_tag(html_string) + regex = /(<\/\w+>)$/ + match = html_string.match(regex) + match.captures[0] + end + + def find_content_between_tags(html_string) + start_tag = find_start_tag(html_string) + end_tag = find_end_tag(html_string) + regex = /^#{start_tag}(.*)#{end_tag}$/ + match = html_string.match(regex) + match.captures[0] + end + + # helper method to check if the HTML string has tags + # base case for recursion == string has no other tags, create text node + def has_tags?(html_string) + regex = /<\w+>/ + html_string.scan(regex).length > 0 + end +end + +if __FILE__ == $0 + html_string = "

div text before

p text

more div text
div text after
" + + parser = HtmlParser.new + p parser.find_start_tag(html_string) + p parser.find_end_tag(html_string) + p parser.find_content_between_tags(html_string) + p parser.has_tags?(html_string) +end From 776d94b32a761f68dbde2dd24fc14b308e68597f Mon Sep 17 00:00:00 2001 From: Roy Chen Date: Fri, 14 Apr 2017 21:13:47 +0800 Subject: [PATCH 005/104] added Struct, changed #initialize, renamed method --- html_parser.rb | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/html_parser.rb b/html_parser.rb index 098dbd5..1e34963 100644 --- a/html_parser.rb +++ b/html_parser.rb @@ -1,8 +1,14 @@ class HtmlParser - def initialize; end + Node = Struct.new(:type, :attributes, :parent, :children) - def parse(html_string) + attr_reader :root + def initialize(html_string) + # root of tree of Nodes that we'll be building + @root = nil + end + + def build_tree(html_string) end def find_start_tag(html_string) From 1af2dbf2cf7f5812c8562bb73ff16efe6ad761ad Mon Sep 17 00:00:00 2001 From: Roy Chen Date: Sat, 15 Apr 2017 20:37:36 +0800 Subject: [PATCH 006/104] starting over for Warmup 2 --- html_parser.rb | 45 ++++----------------------------------------- 1 file changed, 4 insertions(+), 41 deletions(-) diff --git a/html_parser.rb b/html_parser.rb index 1e34963..5308419 100644 --- a/html_parser.rb +++ b/html_parser.rb @@ -1,50 +1,13 @@ -class HtmlParser - Node = Struct.new(:type, :attributes, :parent, :children) - - attr_reader :root +class HTMLParser + attr_reader :html_string def initialize(html_string) - # root of tree of Nodes that we'll be building - @root = nil - end - - def build_tree(html_string) - end - - def find_start_tag(html_string) - regex = /^(<\w+>)/ - match = html_string.match(regex) - match.captures[0] - end - - def find_end_tag(html_string) - regex = /(<\/\w+>)$/ - match = html_string.match(regex) - match.captures[0] - end - - def find_content_between_tags(html_string) - start_tag = find_start_tag(html_string) - end_tag = find_end_tag(html_string) - regex = /^#{start_tag}(.*)#{end_tag}$/ - match = html_string.match(regex) - match.captures[0] - end - - # helper method to check if the HTML string has tags - # base case for recursion == string has no other tags, create text node - def has_tags?(html_string) - regex = /<\w+>/ - html_string.scan(regex).length > 0 + @html_string = html_string end end if __FILE__ == $0 html_string = "
div text before

p text

more div text
div text after
" - parser = HtmlParser.new - p parser.find_start_tag(html_string) - p parser.find_end_tag(html_string) - p parser.find_content_between_tags(html_string) - p parser.has_tags?(html_string) + html_parser = HTMLParser.new(html_string) end From 7a09c0eceebf47bb72f79323ae65dfa33c551493 Mon Sep 17 00:00:00 2001 From: Roy Chen Date: Sun, 16 Apr 2017 15:02:22 +0800 Subject: [PATCH 007/104] implemented #tokenize and #process_token methods --- html_parser.rb | 55 +++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 54 insertions(+), 1 deletion(-) diff --git a/html_parser.rb b/html_parser.rb index 5308419..9aad40e 100644 --- a/html_parser.rb +++ b/html_parser.rb @@ -1,8 +1,55 @@ class HTMLParser - attr_reader :html_string + attr_reader :html_string, + :tokens def initialize(html_string) @html_string = html_string + @tokens = [] + end + + # split up the HTML string into individual tokens, + # so that we can create Nodes from them + def tokenize(html_string) + regex = /(?)/ + tokens = html_string.scan(regex).flatten + + # now process each token to ensure that it really is a distinct, + # separate token + tokens.each do |token| + processed_token = process_token(token) + tokens[tokens.index(token)] = processed_token if processed_token + end + + @tokens = tokens.flatten + end + + def process_token(token) + # we want to further check the token for edge cases + # e.g. a token could combine both a tag and content: + # after + # mid + # tag) + # we want to break these into two separate tokens, + # returning them as an array + + regex = /(.+)<\/?\w+>|<\/?\w+>(.+)/ + captures = token.match(regex).captures unless token.match(regex).nil? + + unless captures.nil? + # check for pre/post match + text_before_tag = captures[0] + text_after_tag = captures[1] + + tag = nil + + if text_before_tag + tag = token.sub(text_before_tag, "") + return [text_before_tag, tag] + elsif text_after_tag + tag = token.sub(text_after_tag, "") + return [tag, text_after_tag] + end + end end end @@ -10,4 +57,10 @@ def initialize(html_string) html_string = "
div text before

p text

more div text
div text after
" html_parser = HTMLParser.new(html_string) + + test_strings = ["after", "mid", "tag)", ""] + test_strings.each { |string| p html_parser.process_token(string) } + + html_parser.tokenize(html_string) + p html_parser.tokens end From 2cb0dc69713ee784af312ca0f2f66b239608d32e Mon Sep 17 00:00:00 2001 From: Roy Chen Date: Sun, 16 Apr 2017 15:10:36 +0800 Subject: [PATCH 008/104] refactored #process_token --- html_parser.rb | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/html_parser.rb b/html_parser.rb index 9aad40e..cf3d1bf 100644 --- a/html_parser.rb +++ b/html_parser.rb @@ -35,13 +35,11 @@ def process_token(token) regex = /(.+)<\/?\w+>|<\/?\w+>(.+)/ captures = token.match(regex).captures unless token.match(regex).nil? + # check for text before/after a tag unless captures.nil? - # check for pre/post match text_before_tag = captures[0] text_after_tag = captures[1] - tag = nil - if text_before_tag tag = token.sub(text_before_tag, "") return [text_before_tag, tag] From 8919e3e2f8bebcf2dd062ecc6ecfaa628819cdf7 Mon Sep 17 00:00:00 2001 From: Roy Chen Date: Sun, 16 Apr 2017 21:17:03 +0800 Subject: [PATCH 009/104] added helper methods to check if a token is an open tag/close tag/text --- html_parser.rb | 24 ++++++++++++++++++------ 1 file changed, 18 insertions(+), 6 deletions(-) diff --git a/html_parser.rb b/html_parser.rb index cf3d1bf..46d9216 100644 --- a/html_parser.rb +++ b/html_parser.rb @@ -1,4 +1,6 @@ class HTMLParser + Node = Struct.new(:type, :attributes, :parent, :children) + attr_reader :html_string, :tokens @@ -7,6 +9,22 @@ def initialize(html_string) @tokens = [] end + def is_open_tag?(token) + regex = /^<(\w+)>$/ + !token.match(regex).nil? + end + + def is_close_tag?(token) + regex = /^<(\/\w+)>$/ + !token.match(regex).nil? + end + + def is_text?(token) + regex = /^[^<>\/]+$/ + !token.match(regex).nil? + end + + # split up the HTML string into individual tokens, # so that we can create Nodes from them def tokenize(html_string) @@ -55,10 +73,4 @@ def process_token(token) html_string = "
div text before

p text

more div text
div text after
" html_parser = HTMLParser.new(html_string) - - test_strings = ["after", "mid", "tag)", ""] - test_strings.each { |string| p html_parser.process_token(string) } - - html_parser.tokenize(html_string) - p html_parser.tokens end From 526fdf57727ea4705cae991544f51c5c0777f811 Mon Sep 17 00:00:00 2001 From: Roy Chen Date: Sun, 16 Apr 2017 22:11:04 +0800 Subject: [PATCH 010/104] implemented #build_tree method with Node struct --- html_parser.rb | 47 +++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 43 insertions(+), 4 deletions(-) diff --git a/html_parser.rb b/html_parser.rb index 46d9216..58b7a47 100644 --- a/html_parser.rb +++ b/html_parser.rb @@ -1,12 +1,49 @@ class HTMLParser - Node = Struct.new(:type, :attributes, :parent, :children) + Node = Struct.new(:type, :content, :attributes, :parent, :children) attr_reader :html_string, - :tokens + :tokens, + :root def initialize(html_string) @html_string = html_string - @tokens = [] + @tokens = tokenize(html_string) + @root = nil + + # pass in a dup of @tokens to ensure we don't modify @tokens + # while building the tree + build_tree(self.tokens.dup) + end + + def build_tree(tokens) + current_node = self.root + + until tokens.empty? + token = tokens.shift + + # if the token is text, add create a new :text Node + # and add it to the current node's children + if is_text?(token) + current_node.children << Node.new(:text, token, {}, current_node, []) + elsif is_open_tag?(token) + # create the root node if it does not exist yet + if current_node.nil? + @root = Node.new(:tag, token, {}, nil, []) + current_node = self.root + else + # we already have a Node (which may or may not be the root node), + # but now we have a new opening tag, so create a new Node as one of + # the current node's children, set the new node's parent to the + # current node, then change the current node + current_node.children << Node.new(:open_tag, token, {}, current_node, []) + current_node = current_node.children.last + end + elsif is_close_tag?(token) + # we found a closing tag, move the current node back + # to the current node's parent + current_node = current_node.parent + end + end end def is_open_tag?(token) @@ -24,7 +61,6 @@ def is_text?(token) !token.match(regex).nil? end - # split up the HTML string into individual tokens, # so that we can create Nodes from them def tokenize(html_string) @@ -73,4 +109,7 @@ def process_token(token) html_string = "
div text before

p text

more div text
div text after
" html_parser = HTMLParser.new(html_string) + + p html_parser.tokens + p html_parser.root end From d337e90257980ed4b748e25ba75fa459df5cf210 Mon Sep 17 00:00:00 2001 From: Roy Chen Date: Mon, 17 Apr 2017 16:51:12 +0800 Subject: [PATCH 011/104] fixed typo in #build_tree method --- html_parser.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/html_parser.rb b/html_parser.rb index 58b7a47..8502266 100644 --- a/html_parser.rb +++ b/html_parser.rb @@ -35,7 +35,7 @@ def build_tree(tokens) # but now we have a new opening tag, so create a new Node as one of # the current node's children, set the new node's parent to the # current node, then change the current node - current_node.children << Node.new(:open_tag, token, {}, current_node, []) + current_node.children << Node.new(:tag, token, {}, current_node, []) current_node = current_node.children.last end elsif is_close_tag?(token) From fcb49788742147eb351572bc8750dea9df3875cb Mon Sep 17 00:00:00 2001 From: Roy Chen Date: Mon, 17 Apr 2017 17:08:03 +0800 Subject: [PATCH 012/104] implemented #print_tree method --- html_parser.rb | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/html_parser.rb b/html_parser.rb index 8502266..2c00d08 100644 --- a/html_parser.rb +++ b/html_parser.rb @@ -15,6 +15,18 @@ def initialize(html_string) build_tree(self.tokens.dup) end + def print_tree(start_node) + unless start_node.nil? + if start_node.type == :text + print "#{start_node.content} " + elsif start_node.type == :tag + puts "\n#{start_node.content}" + start_node.children.each { |child_node| print_tree(child_node) } + print "\n#{start_node.content[0]}/#{start_node.content[1..-1]}\n" + end + end + end + def build_tree(tokens) current_node = self.root @@ -112,4 +124,6 @@ def process_token(token) p html_parser.tokens p html_parser.root + + html_parser.print_tree(html_parser.root) end From 82696bb8c883d01a03e1d87a0f4c5e03fe509415 Mon Sep 17 00:00:00 2001 From: Roy Chen Date: Mon, 17 Apr 2017 17:10:46 +0800 Subject: [PATCH 013/104] finished Warmup 2 --- warmups/html_parser.rb | 129 +++++++++++++++++++++++++++++++++++++++++ warmups/tag_parser.rb | 53 +++++++++++++++++ 2 files changed, 182 insertions(+) create mode 100644 warmups/html_parser.rb create mode 100644 warmups/tag_parser.rb diff --git a/warmups/html_parser.rb b/warmups/html_parser.rb new file mode 100644 index 0000000..2c00d08 --- /dev/null +++ b/warmups/html_parser.rb @@ -0,0 +1,129 @@ +class HTMLParser + Node = Struct.new(:type, :content, :attributes, :parent, :children) + + attr_reader :html_string, + :tokens, + :root + + def initialize(html_string) + @html_string = html_string + @tokens = tokenize(html_string) + @root = nil + + # pass in a dup of @tokens to ensure we don't modify @tokens + # while building the tree + build_tree(self.tokens.dup) + end + + def print_tree(start_node) + unless start_node.nil? + if start_node.type == :text + print "#{start_node.content} " + elsif start_node.type == :tag + puts "\n#{start_node.content}" + start_node.children.each { |child_node| print_tree(child_node) } + print "\n#{start_node.content[0]}/#{start_node.content[1..-1]}\n" + end + end + end + + def build_tree(tokens) + current_node = self.root + + until tokens.empty? + token = tokens.shift + + # if the token is text, add create a new :text Node + # and add it to the current node's children + if is_text?(token) + current_node.children << Node.new(:text, token, {}, current_node, []) + elsif is_open_tag?(token) + # create the root node if it does not exist yet + if current_node.nil? + @root = Node.new(:tag, token, {}, nil, []) + current_node = self.root + else + # we already have a Node (which may or may not be the root node), + # but now we have a new opening tag, so create a new Node as one of + # the current node's children, set the new node's parent to the + # current node, then change the current node + current_node.children << Node.new(:tag, token, {}, current_node, []) + current_node = current_node.children.last + end + elsif is_close_tag?(token) + # we found a closing tag, move the current node back + # to the current node's parent + current_node = current_node.parent + end + end + end + + def is_open_tag?(token) + regex = /^<(\w+)>$/ + !token.match(regex).nil? + end + + def is_close_tag?(token) + regex = /^<(\/\w+)>$/ + !token.match(regex).nil? + end + + def is_text?(token) + regex = /^[^<>\/]+$/ + !token.match(regex).nil? + end + + # split up the HTML string into individual tokens, + # so that we can create Nodes from them + def tokenize(html_string) + regex = /(?)/ + tokens = html_string.scan(regex).flatten + + # now process each token to ensure that it really is a distinct, + # separate token + tokens.each do |token| + processed_token = process_token(token) + tokens[tokens.index(token)] = processed_token if processed_token + end + + @tokens = tokens.flatten + end + + def process_token(token) + # we want to further check the token for edge cases + # e.g. a token could combine both a tag and content: + # after + # mid + # tag) + # we want to break these into two separate tokens, + # returning them as an array + + regex = /(.+)<\/?\w+>|<\/?\w+>(.+)/ + captures = token.match(regex).captures unless token.match(regex).nil? + + # check for text before/after a tag + unless captures.nil? + text_before_tag = captures[0] + text_after_tag = captures[1] + + if text_before_tag + tag = token.sub(text_before_tag, "") + return [text_before_tag, tag] + elsif text_after_tag + tag = token.sub(text_after_tag, "") + return [tag, text_after_tag] + end + end + end +end + +if __FILE__ == $0 + html_string = "
div text before

p text

more div text
div text after
" + + html_parser = HTMLParser.new(html_string) + + p html_parser.tokens + p html_parser.root + + html_parser.print_tree(html_parser.root) +end diff --git a/warmups/tag_parser.rb b/warmups/tag_parser.rb new file mode 100644 index 0000000..f67af3a --- /dev/null +++ b/warmups/tag_parser.rb @@ -0,0 +1,53 @@ +# it may be possible to use an OpenStruct instead, so that we can use dot +# notation to access the tag's attributes +# +# require 'ostruct' +# Tag = OpenStruct.new(:type, :class, :id, :name) +# tag = Tag.new +# tag.new_attr = new_value +# +# or we can create the OpenStruct by passing in a hash +# tag = OpenStruct.new(hash) + +class TagParser + def initialize; end + + # parse the tag, initialize a hash to store the tag's type/attributes + def parse_tag(tag) + tag_type = find_tag_type(tag) + tag_attributes = find_tag_attributes(tag) + + parsed_tag = {} + parsed_tag[tag_type.to_sym] = tag_type + + tag_attributes.each do |tag_attribute| + attr = tag_attribute[0] + value = tag_attribute[1].split + value = value.first if value.length == 1 + parsed_tag[attr.to_sym] = value + end + + parsed_tag + end + + private + + # find the tag type + def find_tag_type(tag) + type_regex = /^<(\w+)/ + type_match = tag.match(type_regex) + type_match.captures.first + end + + # find all the tag's attributes + def find_tag_attributes(tag) + attribute_regex = /(\w+)=['"](.+?)['"]/ + attribute_matches = tag.scan(attribute_regex) + end +end + +if __FILE__ == $0 + parser = TagParser.new + tag = parser.parse_tag("

") + p tag +end From 1ccd7dd49bd0bfeefcc25ee60a1f86a1a7b7da77 Mon Sep 17 00:00:00 2001 From: Roy Chen Date: Mon, 17 Apr 2017 17:11:30 +0800 Subject: [PATCH 014/104] moved files --- html_parser.rb | 129 ------------------------------------------------- tag_parser.rb | 53 -------------------- 2 files changed, 182 deletions(-) delete mode 100644 html_parser.rb delete mode 100644 tag_parser.rb diff --git a/html_parser.rb b/html_parser.rb deleted file mode 100644 index 2c00d08..0000000 --- a/html_parser.rb +++ /dev/null @@ -1,129 +0,0 @@ -class HTMLParser - Node = Struct.new(:type, :content, :attributes, :parent, :children) - - attr_reader :html_string, - :tokens, - :root - - def initialize(html_string) - @html_string = html_string - @tokens = tokenize(html_string) - @root = nil - - # pass in a dup of @tokens to ensure we don't modify @tokens - # while building the tree - build_tree(self.tokens.dup) - end - - def print_tree(start_node) - unless start_node.nil? - if start_node.type == :text - print "#{start_node.content} " - elsif start_node.type == :tag - puts "\n#{start_node.content}" - start_node.children.each { |child_node| print_tree(child_node) } - print "\n#{start_node.content[0]}/#{start_node.content[1..-1]}\n" - end - end - end - - def build_tree(tokens) - current_node = self.root - - until tokens.empty? - token = tokens.shift - - # if the token is text, add create a new :text Node - # and add it to the current node's children - if is_text?(token) - current_node.children << Node.new(:text, token, {}, current_node, []) - elsif is_open_tag?(token) - # create the root node if it does not exist yet - if current_node.nil? - @root = Node.new(:tag, token, {}, nil, []) - current_node = self.root - else - # we already have a Node (which may or may not be the root node), - # but now we have a new opening tag, so create a new Node as one of - # the current node's children, set the new node's parent to the - # current node, then change the current node - current_node.children << Node.new(:tag, token, {}, current_node, []) - current_node = current_node.children.last - end - elsif is_close_tag?(token) - # we found a closing tag, move the current node back - # to the current node's parent - current_node = current_node.parent - end - end - end - - def is_open_tag?(token) - regex = /^<(\w+)>$/ - !token.match(regex).nil? - end - - def is_close_tag?(token) - regex = /^<(\/\w+)>$/ - !token.match(regex).nil? - end - - def is_text?(token) - regex = /^[^<>\/]+$/ - !token.match(regex).nil? - end - - # split up the HTML string into individual tokens, - # so that we can create Nodes from them - def tokenize(html_string) - regex = /(?)/ - tokens = html_string.scan(regex).flatten - - # now process each token to ensure that it really is a distinct, - # separate token - tokens.each do |token| - processed_token = process_token(token) - tokens[tokens.index(token)] = processed_token if processed_token - end - - @tokens = tokens.flatten - end - - def process_token(token) - # we want to further check the token for edge cases - # e.g. a token could combine both a tag and content: - # after - # mid - # tag) - # we want to break these into two separate tokens, - # returning them as an array - - regex = /(.+)<\/?\w+>|<\/?\w+>(.+)/ - captures = token.match(regex).captures unless token.match(regex).nil? - - # check for text before/after a tag - unless captures.nil? - text_before_tag = captures[0] - text_after_tag = captures[1] - - if text_before_tag - tag = token.sub(text_before_tag, "") - return [text_before_tag, tag] - elsif text_after_tag - tag = token.sub(text_after_tag, "") - return [tag, text_after_tag] - end - end - end -end - -if __FILE__ == $0 - html_string = "

div text before

p text

more div text
div text after
" - - html_parser = HTMLParser.new(html_string) - - p html_parser.tokens - p html_parser.root - - html_parser.print_tree(html_parser.root) -end diff --git a/tag_parser.rb b/tag_parser.rb deleted file mode 100644 index f67af3a..0000000 --- a/tag_parser.rb +++ /dev/null @@ -1,53 +0,0 @@ -# it may be possible to use an OpenStruct instead, so that we can use dot -# notation to access the tag's attributes -# -# require 'ostruct' -# Tag = OpenStruct.new(:type, :class, :id, :name) -# tag = Tag.new -# tag.new_attr = new_value -# -# or we can create the OpenStruct by passing in a hash -# tag = OpenStruct.new(hash) - -class TagParser - def initialize; end - - # parse the tag, initialize a hash to store the tag's type/attributes - def parse_tag(tag) - tag_type = find_tag_type(tag) - tag_attributes = find_tag_attributes(tag) - - parsed_tag = {} - parsed_tag[tag_type.to_sym] = tag_type - - tag_attributes.each do |tag_attribute| - attr = tag_attribute[0] - value = tag_attribute[1].split - value = value.first if value.length == 1 - parsed_tag[attr.to_sym] = value - end - - parsed_tag - end - - private - - # find the tag type - def find_tag_type(tag) - type_regex = /^<(\w+)/ - type_match = tag.match(type_regex) - type_match.captures.first - end - - # find all the tag's attributes - def find_tag_attributes(tag) - attribute_regex = /(\w+)=['"](.+?)['"]/ - attribute_matches = tag.scan(attribute_regex) - end -end - -if __FILE__ == $0 - parser = TagParser.new - tag = parser.parse_tag("

") - p tag -end From 317c12224d05d22e24f566a9211d4b4d36e1a969 Mon Sep 17 00:00:00 2001 From: Roy Chen Date: Mon, 17 Apr 2017 18:27:24 +0800 Subject: [PATCH 015/104] completed Warmup 3 --- warmups/design_pseudocode.md | 51 ++++++++++++++++++++++++++++++++++++ 1 file changed, 51 insertions(+) create mode 100644 warmups/design_pseudocode.md diff --git a/warmups/design_pseudocode.md b/warmups/design_pseudocode.md new file mode 100644 index 0000000..8982e00 --- /dev/null +++ b/warmups/design_pseudocode.md @@ -0,0 +1,51 @@ +### Pseudocoding the design + +``` +# DOMReader +Create an instance of DOMReader +Read in the provided HTML file +Parse it into individual tokens using regular expressions +Process each token to ensure that each edge case is handled: + - a tag with text before it, e.g. text, or text + - a tag with text after it, e.g. blah, or bar +Ensure that these tokens are processed and further broken down +Store all tokens in an array + +# DOMParser +Pass the token array to an instance of DOMParser +Implement a Node struct, with these attributes: + - :type, for the type of tag + - :content, to store the text content + - :attributes, an array for this tag's attributes, if any + - :depth, for the depth of this node in the tree + - :parent, for this node's parent node + - :children, an array to store any children this node might have + +Initialize a depth counter to 0 +Set the current node to nil +Dequeue the first element of the token array, and set it as the current token +If the current token is an opening tag + Create a new Node struct + Parse the token's attributes, if any + Set the Node's type, attributes, depth accordingly + If the root of the tree is nil + Set the Node's parent to nil + Set the Node's children to an empty array + Set the current node to the root Node + Else + Add this Node as the last child of the current Node's children + Set this Node's parent to the current Node + Set the current Node to this Node + Increment the current depth +If the current token is just text + Create a new Node + Set the Node's type to :text + Set the Node's content accordingly + Set the Node's parent to the current node +If the current token is a closing tag + Set the current node to the current node's parent + Decrement the current depth +Continue dequeueing and processing tokens from the array until the array is empty +Return the built tree + +``` From ed28afbc1f734d6dfec75594d42cca556790486b Mon Sep 17 00:00:00 2001 From: Roy Chen Date: Mon, 17 Apr 2017 18:27:45 +0800 Subject: [PATCH 016/104] init RSpec --- .rspec | 3 ++ Gemfile | 6 +++ Gemfile.lock | 65 ++++++++++++++++++++++++++++ spec/spec_helper.rb | 103 ++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 177 insertions(+) create mode 100644 .rspec create mode 100644 Gemfile create mode 100644 Gemfile.lock create mode 100644 spec/spec_helper.rb diff --git a/.rspec b/.rspec new file mode 100644 index 0000000..7e3eb5c --- /dev/null +++ b/.rspec @@ -0,0 +1,3 @@ +--color +--require spec_helper +--format doc diff --git a/Gemfile b/Gemfile new file mode 100644 index 0000000..ce3fe7f --- /dev/null +++ b/Gemfile @@ -0,0 +1,6 @@ +# Gemfile + +source 'https://rubygems.org' + +gem 'rspec', '~> 3.5.0' +gem 'guard-rspec', '~> 4.7.3', require: false \ No newline at end of file diff --git a/Gemfile.lock b/Gemfile.lock new file mode 100644 index 0000000..3874780 --- /dev/null +++ b/Gemfile.lock @@ -0,0 +1,65 @@ +GEM + remote: https://rubygems.org/ + specs: + coderay (1.1.1) + diff-lcs (1.3) + ffi (1.9.18) + formatador (0.2.5) + guard (2.14.1) + formatador (>= 0.2.4) + listen (>= 2.7, < 4.0) + lumberjack (~> 1.0) + nenv (~> 0.1) + notiffany (~> 0.0) + pry (>= 0.9.12) + shellany (~> 0.0) + thor (>= 0.18.1) + guard-compat (1.2.1) + guard-rspec (4.7.3) + guard (~> 2.1) + guard-compat (~> 1.1) + rspec (>= 2.99.0, < 4.0) + listen (3.1.5) + rb-fsevent (~> 0.9, >= 0.9.4) + rb-inotify (~> 0.9, >= 0.9.7) + ruby_dep (~> 1.2) + lumberjack (1.0.11) + method_source (0.8.2) + nenv (0.3.0) + notiffany (0.1.1) + nenv (~> 0.1) + shellany (~> 0.0) + pry (0.10.4) + coderay (~> 1.1.0) + method_source (~> 0.8.1) + slop (~> 3.4) + rb-fsevent (0.9.8) + rb-inotify (0.9.8) + ffi (>= 0.5.0) + rspec (3.5.0) + rspec-core (~> 3.5.0) + rspec-expectations (~> 3.5.0) + rspec-mocks (~> 3.5.0) + rspec-core (3.5.4) + rspec-support (~> 3.5.0) + rspec-expectations (3.5.0) + diff-lcs (>= 1.2.0, < 2.0) + rspec-support (~> 3.5.0) + rspec-mocks (3.5.0) + diff-lcs (>= 1.2.0, < 2.0) + rspec-support (~> 3.5.0) + rspec-support (3.5.0) + ruby_dep (1.5.0) + shellany (0.0.1) + slop (3.6.0) + thor (0.19.4) + +PLATFORMS + ruby + +DEPENDENCIES + guard-rspec (~> 4.7.3) + rspec (~> 3.5.0) + +BUNDLED WITH + 1.14.4 diff --git a/spec/spec_helper.rb b/spec/spec_helper.rb new file mode 100644 index 0000000..47b39ce --- /dev/null +++ b/spec/spec_helper.rb @@ -0,0 +1,103 @@ +# This file was generated by the `rspec --init` command. Conventionally, all +# specs live under a `spec` directory, which RSpec adds to the `$LOAD_PATH`. +# The generated `.rspec` file contains `--require spec_helper` which will cause +# this file to always be loaded, without a need to explicitly require it in any +# files. +# +# Given that it is always loaded, you are encouraged to keep this file as +# light-weight as possible. Requiring heavyweight dependencies from this file +# will add to the boot time of your test suite on EVERY test run, even for an +# individual file that may not need all of that loaded. Instead, consider making +# a separate helper file that requires the additional dependencies and performs +# the additional setup, and require it from the spec files that actually need +# it. +# +# The `.rspec` file also contains a few flags that are not defaults but that +# users commonly want. +# +# See http://rubydoc.info/gems/rspec-core/RSpec/Core/Configuration +RSpec.configure do |config| + # rspec-expectations config goes here. You can use an alternate + # assertion/expectation library such as wrong or the stdlib/minitest + # assertions if you prefer. + config.expect_with :rspec do |expectations| + # This option will default to `true` in RSpec 4. It makes the `description` + # and `failure_message` of custom matchers include text for helper methods + # defined using `chain`, e.g.: + # be_bigger_than(2).and_smaller_than(4).description + # # => "be bigger than 2 and smaller than 4" + # ...rather than: + # # => "be bigger than 2" + expectations.include_chain_clauses_in_custom_matcher_descriptions = true + end + + # rspec-mocks config goes here. You can use an alternate test double + # library (such as bogus or mocha) by changing the `mock_with` option here. + config.mock_with :rspec do |mocks| + # Prevents you from mocking or stubbing a method that does not exist on + # a real object. This is generally recommended, and will default to + # `true` in RSpec 4. + mocks.verify_partial_doubles = true + end + + # This option will default to `:apply_to_host_groups` in RSpec 4 (and will + # have no way to turn it off -- the option exists only for backwards + # compatibility in RSpec 3). It causes shared context metadata to be + # inherited by the metadata hash of host groups and examples, rather than + # triggering implicit auto-inclusion in groups with matching metadata. + config.shared_context_metadata_behavior = :apply_to_host_groups + +# The settings below are suggested to provide a good initial experience +# with RSpec, but feel free to customize to your heart's content. +=begin + # This allows you to limit a spec run to individual examples or groups + # you care about by tagging them with `:focus` metadata. When nothing + # is tagged with `:focus`, all examples get run. RSpec also provides + # aliases for `it`, `describe`, and `context` that include `:focus` + # metadata: `fit`, `fdescribe` and `fcontext`, respectively. + config.filter_run_when_matching :focus + + # Allows RSpec to persist some state between runs in order to support + # the `--only-failures` and `--next-failure` CLI options. We recommend + # you configure your source control system to ignore this file. + config.example_status_persistence_file_path = "spec/examples.txt" + + # Limits the available syntax to the non-monkey patched syntax that is + # recommended. For more details, see: + # - http://rspec.info/blog/2012/06/rspecs-new-expectation-syntax/ + # - http://www.teaisaweso.me/blog/2013/05/27/rspecs-new-message-expectation-syntax/ + # - http://rspec.info/blog/2014/05/notable-changes-in-rspec-3/#zero-monkey-patching-mode + config.disable_monkey_patching! + + # This setting enables warnings. It's recommended, but in some cases may + # be too noisy due to issues in dependencies. + config.warnings = true + + # Many RSpec users commonly either run the entire suite or an individual + # file, and it's useful to allow more verbose output when running an + # individual spec file. + if config.files_to_run.one? + # Use the documentation formatter for detailed output, + # unless a formatter has already been configured + # (e.g. via a command-line flag). + config.default_formatter = 'doc' + end + + # Print the 10 slowest examples and example groups at the + # end of the spec run, to help surface which specs are running + # particularly slow. + config.profile_examples = 10 + + # Run specs in random order to surface order dependencies. If you find an + # order dependency and want to debug it, you can fix the order by providing + # the seed, which is printed after each run. + # --seed 1234 + config.order = :random + + # Seed global randomization in this process using the `--seed` CLI option. + # Setting this allows you to use `--seed` to deterministically reproduce + # test failures related to randomization by passing the same `--seed` value + # as the one that triggered the failure. + Kernel.srand config.seed +=end +end From 5f6a82ebc6d02b07c2ff021b042ebc60ac03b6b6 Mon Sep 17 00:00:00 2001 From: Roy Chen Date: Tue, 18 Apr 2017 13:02:57 +0800 Subject: [PATCH 017/104] created files for DOMReader class --- dom_reader.rb | 0 spec/dom_reader_spec.rb | 0 2 files changed, 0 insertions(+), 0 deletions(-) create mode 100644 dom_reader.rb create mode 100644 spec/dom_reader_spec.rb diff --git a/dom_reader.rb b/dom_reader.rb new file mode 100644 index 0000000..e69de29 diff --git a/spec/dom_reader_spec.rb b/spec/dom_reader_spec.rb new file mode 100644 index 0000000..e69de29 From a69e73070d8d2b148b8137394cefec09dde73b00 Mon Sep 17 00:00:00 2001 From: Roy Chen Date: Tue, 18 Apr 2017 13:05:10 +0800 Subject: [PATCH 018/104] added Guardfile --- Guardfile | 70 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 70 insertions(+) create mode 100644 Guardfile diff --git a/Guardfile b/Guardfile new file mode 100644 index 0000000..3215f01 --- /dev/null +++ b/Guardfile @@ -0,0 +1,70 @@ +# A sample Guardfile +# More info at https://github.com/guard/guard#readme + +## Uncomment and set this to only include directories you want to watch +# directories %w(app lib config test spec features) \ +# .select{|d| Dir.exists?(d) ? d : UI.warning("Directory #{d} does not exist")} + +## Note: if you are using the `directories` clause above and you are not +## watching the project directory ('.'), then you will want to move +## the Guardfile to a watched dir and symlink it back, e.g. +# +# $ mkdir config +# $ mv Guardfile config/ +# $ ln -s config/Guardfile . +# +# and, you'll have to watch "config/Guardfile" instead of "Guardfile" + +# Note: The cmd option is now required due to the increasing number of ways +# rspec may be run, below are examples of the most common uses. +# * bundler: 'bundle exec rspec' +# * bundler binstubs: 'bin/rspec' +# * spring: 'bin/rspec' (This will use spring if running and you have +# installed the spring binstubs per the docs) +# * zeus: 'zeus rspec' (requires the server to be started separately) +# * 'just' rspec: 'rspec' + +guard :rspec, cmd: "bundle exec rspec" do + require "guard/rspec/dsl" + dsl = Guard::RSpec::Dsl.new(self) + + # Feel free to open issues for suggestions and improvements + + # RSpec files + rspec = dsl.rspec + watch(rspec.spec_helper) { rspec.spec_dir } + watch(rspec.spec_support) { rspec.spec_dir } + watch(rspec.spec_files) + + # Ruby files + ruby = dsl.ruby + dsl.watch_spec_files_for(ruby.lib_files) + + # Rails files + rails = dsl.rails(view_extensions: %w(erb haml slim)) + dsl.watch_spec_files_for(rails.app_files) + dsl.watch_spec_files_for(rails.views) + + watch(rails.controllers) do |m| + [ + rspec.spec.call("routing/#{m[1]}_routing"), + rspec.spec.call("controllers/#{m[1]}_controller"), + rspec.spec.call("acceptance/#{m[1]}") + ] + end + + # Rails config changes + watch(rails.spec_helper) { rspec.spec_dir } + watch(rails.routes) { "#{rspec.spec_dir}/routing" } + watch(rails.app_controller) { "#{rspec.spec_dir}/controllers" } + + # Capybara features specs + watch(rails.view_dirs) { |m| rspec.spec.call("features/#{m[1]}") } + watch(rails.layouts) { |m| rspec.spec.call("features/#{m[1]}") } + + # Turnip features and steps + watch(%r{^spec/acceptance/(.+)\.feature$}) + watch(%r{^spec/acceptance/steps/(.+)_steps\.rb$}) do |m| + Dir[File.join("**/#{m[1]}.feature")][0] || "spec/acceptance" + end +end From e225dcb39305cbd9864940d24d3d2851825612ed Mon Sep 17 00:00:00 2001 From: Roy Chen Date: Tue, 18 Apr 2017 13:13:36 +0800 Subject: [PATCH 019/104] moved file --- dom_reader.rb | 0 lib/dom_reader.rb | 2 ++ 2 files changed, 2 insertions(+) delete mode 100644 dom_reader.rb create mode 100644 lib/dom_reader.rb diff --git a/dom_reader.rb b/dom_reader.rb deleted file mode 100644 index e69de29..0000000 diff --git a/lib/dom_reader.rb b/lib/dom_reader.rb new file mode 100644 index 0000000..33fc25c --- /dev/null +++ b/lib/dom_reader.rb @@ -0,0 +1,2 @@ +class DOMReader +end From 55e8911d130e331369134e421a91fdbb3568f8ac Mon Sep 17 00:00:00 2001 From: Roy Chen Date: Tue, 18 Apr 2017 13:43:31 +0800 Subject: [PATCH 020/104] defined initial specs --- spec/dom_reader_spec.rb | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/spec/dom_reader_spec.rb b/spec/dom_reader_spec.rb index e69de29..666311b 100644 --- a/spec/dom_reader_spec.rb +++ b/spec/dom_reader_spec.rb @@ -0,0 +1,15 @@ +# spec/dom_reader_spec.rb + +require 'dom_reader' + +describe "DOMReader" do + describe "#initialize" do + it "creates an instance of DOMReader" + end + + describe "#read_file" do + it "raises an error if the file to be read does not exist" + + it "returns the file's contents as an array" + end +end From c4e7fb07bf1ceaf7b12364655df9edc2f0cae309 Mon Sep 17 00:00:00 2001 From: Roy Chen Date: Tue, 18 Apr 2017 13:48:15 +0800 Subject: [PATCH 021/104] added specs/behavior for #initialize --- lib/dom_reader.rb | 1 + spec/dom_reader_spec.rb | 4 +++- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/lib/dom_reader.rb b/lib/dom_reader.rb index 33fc25c..24a3d5b 100644 --- a/lib/dom_reader.rb +++ b/lib/dom_reader.rb @@ -1,2 +1,3 @@ class DOMReader + def initialize; end end diff --git a/spec/dom_reader_spec.rb b/spec/dom_reader_spec.rb index 666311b..f86f48a 100644 --- a/spec/dom_reader_spec.rb +++ b/spec/dom_reader_spec.rb @@ -4,7 +4,9 @@ describe "DOMReader" do describe "#initialize" do - it "creates an instance of DOMReader" + it "creates an instance of DOMReader" do + expect(DOMReader.new).to be_a(DOMReader) + end end describe "#read_file" do From bb246088729c91a72922047053db9dafb0819a54 Mon Sep 17 00:00:00 2001 From: Roy Chen Date: Tue, 18 Apr 2017 13:56:01 +0800 Subject: [PATCH 022/104] added hook, modified spec for #initialize --- spec/dom_reader_spec.rb | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/spec/dom_reader_spec.rb b/spec/dom_reader_spec.rb index f86f48a..caaf8bf 100644 --- a/spec/dom_reader_spec.rb +++ b/spec/dom_reader_spec.rb @@ -3,9 +3,11 @@ require 'dom_reader' describe "DOMReader" do + let(:dom_reader) { DOMReader.new } + describe "#initialize" do it "creates an instance of DOMReader" do - expect(DOMReader.new).to be_a(DOMReader) + expect(dom_reader).to be_a(DOMReader) end end From 10260446b57952192976dd723267ebd91925fff7 Mon Sep 17 00:00:00 2001 From: Roy Chen Date: Tue, 18 Apr 2017 14:01:48 +0800 Subject: [PATCH 023/104] added specs/behavior for #read_file --- lib/dom_reader.rb | 6 ++++++ spec/dom_reader_spec.rb | 9 +++++++-- 2 files changed, 13 insertions(+), 2 deletions(-) diff --git a/lib/dom_reader.rb b/lib/dom_reader.rb index 24a3d5b..827c373 100644 --- a/lib/dom_reader.rb +++ b/lib/dom_reader.rb @@ -1,3 +1,9 @@ class DOMReader def initialize; end + + def read_file(filename) + raise "The file to be read does not exist." unless File.exist?(filename) + + contents = File.readlines(filename) + end end diff --git a/spec/dom_reader_spec.rb b/spec/dom_reader_spec.rb index caaf8bf..1c3a64d 100644 --- a/spec/dom_reader_spec.rb +++ b/spec/dom_reader_spec.rb @@ -12,8 +12,13 @@ end describe "#read_file" do - it "raises an error if the file to be read does not exist" + it "raises an error if the file to be read does not exist" do + expect { dom_reader.read_file("blah") }.to raise_error(/file to be read does not exist/) + end - it "returns the file's contents as an array" + it "returns the file's contents as an array" do + test_file = './test.html' + expect(dom_reader.read_file(test_file)).to be_a(Array) + end end end From 45f9bc3a85ec9875b2681843d50d7da233412a41 Mon Sep 17 00:00:00 2001 From: Roy Chen Date: Tue, 18 Apr 2017 14:12:47 +0800 Subject: [PATCH 024/104] define specs to check for various types of text in our file --- spec/dom_reader_spec.rb | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/spec/dom_reader_spec.rb b/spec/dom_reader_spec.rb index 1c3a64d..5c50ad7 100644 --- a/spec/dom_reader_spec.rb +++ b/spec/dom_reader_spec.rb @@ -21,4 +21,28 @@ expect(dom_reader.read_file(test_file)).to be_a(Array) end end + + describe "#is_open_tag?" do + it "returns true if the argument is an opening tag" + + it "returns false otherwise" + end + + describe "#is_close_tag?" do + it "returns true if the argument is a closing tag" + + it "returns false otherwise" + end + + describe "#is_text?" do + it "returns true if the argument is just text (not a tag)" + + it "returns false otherwise" + end + + describe "#is_mixed_content?" do + it "returns true if the argument contains both a tag and text" + + it "returns false otherwise" + end end From b374d1f527eefceaccae736b36a97db0b6c94e57 Mon Sep 17 00:00:00 2001 From: Roy Chen Date: Tue, 18 Apr 2017 14:24:24 +0800 Subject: [PATCH 025/104] implemented specs/behavior for #is_open_tag? method --- lib/dom_reader.rb | 5 +++++ spec/dom_reader_spec.rb | 9 +++++++-- 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/lib/dom_reader.rb b/lib/dom_reader.rb index 827c373..5bc47d0 100644 --- a/lib/dom_reader.rb +++ b/lib/dom_reader.rb @@ -6,4 +6,9 @@ def read_file(filename) contents = File.readlines(filename) end + + def is_open_tag?(text) + regex = /^<(\w+)>$/ + !text.match(regex).nil? + end end diff --git a/spec/dom_reader_spec.rb b/spec/dom_reader_spec.rb index 5c50ad7..b8c1338 100644 --- a/spec/dom_reader_spec.rb +++ b/spec/dom_reader_spec.rb @@ -23,9 +23,14 @@ end describe "#is_open_tag?" do - it "returns true if the argument is an opening tag" + it "returns true if the argument is an opening tag" do + expect(dom_reader.is_open_tag?("")).to be true + end - it "returns false otherwise" + it "returns false otherwise" do + expect(dom_reader.is_open_tag?("text")).to be false + expect(dom_reader.is_open_tag?("")).to be false + end end describe "#is_close_tag?" do From fd4053e2cd496bb97f318cee4de3abcd7ad255f2 Mon Sep 17 00:00:00 2001 From: Roy Chen Date: Tue, 18 Apr 2017 14:35:14 +0800 Subject: [PATCH 026/104] implemented specs/behavior for #is_close_tag? method --- lib/dom_reader.rb | 5 +++++ spec/dom_reader_spec.rb | 9 +++++++-- 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/lib/dom_reader.rb b/lib/dom_reader.rb index 5bc47d0..bba5fb4 100644 --- a/lib/dom_reader.rb +++ b/lib/dom_reader.rb @@ -11,4 +11,9 @@ def is_open_tag?(text) regex = /^<(\w+)>$/ !text.match(regex).nil? end + + def is_close_tag?(text) + regex = /^<(\/\w+)>$/ + !text.match(regex).nil? + end end diff --git a/spec/dom_reader_spec.rb b/spec/dom_reader_spec.rb index b8c1338..ccd0d01 100644 --- a/spec/dom_reader_spec.rb +++ b/spec/dom_reader_spec.rb @@ -34,9 +34,14 @@ end describe "#is_close_tag?" do - it "returns true if the argument is a closing tag" + it "returns true if the argument is a closing tag" do + expect(dom_reader.is_close_tag?("")).to be true + end - it "returns false otherwise" + it "returns false otherwise" do + expect(dom_reader.is_close_tag?("")).to be false + expect(dom_reader.is_close_tag?("text")).to be false + end end describe "#is_text?" do From bf07b5ae1fd4fb15d96b34329d11061b669eb39f Mon Sep 17 00:00:00 2001 From: Roy Chen Date: Tue, 18 Apr 2017 14:43:37 +0800 Subject: [PATCH 027/104] implemented specs/behavior for #is_text? method --- lib/dom_reader.rb | 5 +++++ spec/dom_reader_spec.rb | 9 +++++++-- 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/lib/dom_reader.rb b/lib/dom_reader.rb index bba5fb4..c75308b 100644 --- a/lib/dom_reader.rb +++ b/lib/dom_reader.rb @@ -16,4 +16,9 @@ def is_close_tag?(text) regex = /^<(\/\w+)>$/ !text.match(regex).nil? end + + def is_text?(text) + regex = /^[^<>\/]+$/ + !text.match(regex).nil? + end end diff --git a/spec/dom_reader_spec.rb b/spec/dom_reader_spec.rb index ccd0d01..549425d 100644 --- a/spec/dom_reader_spec.rb +++ b/spec/dom_reader_spec.rb @@ -45,9 +45,14 @@ end describe "#is_text?" do - it "returns true if the argument is just text (not a tag)" + it "returns true if the argument is just text (not a tag)" do + expect(dom_reader.is_text?("text")).to be true + end - it "returns false otherwise" + it "returns false otherwise" do + expect(dom_reader.is_text?("")).to be false + expect(dom_reader.is_text?("")).to be false + end end describe "#is_mixed_content?" do From a345197d450252ea821b9a96ea2cecdb04837b1f Mon Sep 17 00:00:00 2001 From: Roy Chen Date: Wed, 19 Apr 2017 12:47:13 +0800 Subject: [PATCH 028/104] implemented specs/behavior for #is_mixed_content? method --- lib/dom_reader.rb | 5 +++++ spec/dom_reader_spec.rb | 13 +++++++++++-- 2 files changed, 16 insertions(+), 2 deletions(-) diff --git a/lib/dom_reader.rb b/lib/dom_reader.rb index c75308b..d88a12d 100644 --- a/lib/dom_reader.rb +++ b/lib/dom_reader.rb @@ -21,4 +21,9 @@ def is_text?(text) regex = /^[^<>\/]+$/ !text.match(regex).nil? end + + def is_mixed_content?(text) + regex = /(.+)<\/?\w+>|<\/?\w+>(.+)/ + !text.match(regex).nil? + end end diff --git a/spec/dom_reader_spec.rb b/spec/dom_reader_spec.rb index 549425d..5292a2a 100644 --- a/spec/dom_reader_spec.rb +++ b/spec/dom_reader_spec.rb @@ -56,8 +56,17 @@ end describe "#is_mixed_content?" do - it "returns true if the argument contains both a tag and text" + it "returns true if the argument contains both a tag and text" do + expect(dom_reader.is_mixed_content?("text")).to be true + expect(dom_reader.is_mixed_content?("text")).to be true + expect(dom_reader.is_mixed_content?("text")).to be true + expect(dom_reader.is_mixed_content?("text")).to be true + end - it "returns false otherwise" + it "returns false otherwise" do + expect(dom_reader.is_mixed_content?("")).to be false + expect(dom_reader.is_mixed_content?("")).to be false + expect(dom_reader.is_mixed_content?("text")).to be false + end end end From 4c259ac405509205ce11a3cc4aac0957eb342fe1 Mon Sep 17 00:00:00 2001 From: Roy Chen Date: Wed, 19 Apr 2017 13:03:38 +0800 Subject: [PATCH 029/104] implemented specs/behavior for #tokenize method --- lib/dom_reader.rb | 5 +++++ spec/dom_reader_spec.rb | 13 +++++++++++++ 2 files changed, 18 insertions(+) diff --git a/lib/dom_reader.rb b/lib/dom_reader.rb index d88a12d..a0729cd 100644 --- a/lib/dom_reader.rb +++ b/lib/dom_reader.rb @@ -26,4 +26,9 @@ def is_mixed_content?(text) regex = /(.+)<\/?\w+>|<\/?\w+>(.+)/ !text.match(regex).nil? end + + def tokenize(text) + regex = /(?)/ + tokens = text.scan(regex).flatten + end end diff --git a/spec/dom_reader_spec.rb b/spec/dom_reader_spec.rb index 5292a2a..36f27b1 100644 --- a/spec/dom_reader_spec.rb +++ b/spec/dom_reader_spec.rb @@ -69,4 +69,17 @@ expect(dom_reader.is_mixed_content?("text")).to be false end end + + describe "#tokenize" do + it "breaks down the input string into tokens" do + test_string = "This document contains data\n" + tokens = dom_reader.tokenize(test_string) + + expect(tokens.length).to eq(4) + expect(tokens[0]).to eq("This") + expect(tokens[1]).to eq("document") + expect(tokens[2]).to eq("contains") + expect(tokens[3]).to eq("data\n") + end + end end From b97eb4f783927d58004aaf3ff067fe22a91c677a Mon Sep 17 00:00:00 2001 From: Roy Chen Date: Wed, 19 Apr 2017 14:31:13 +0800 Subject: [PATCH 030/104] implemented specs/behavior for #split_tag_and_text method --- lib/dom_reader.rb | 18 +++++++++++++++++ spec/dom_reader_spec.rb | 44 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 62 insertions(+) diff --git a/lib/dom_reader.rb b/lib/dom_reader.rb index a0729cd..c430db5 100644 --- a/lib/dom_reader.rb +++ b/lib/dom_reader.rb @@ -31,4 +31,22 @@ def tokenize(text) regex = /(?)/ tokens = text.scan(regex).flatten end + + def split_tag_and_text(token) + return nil unless is_mixed_content?(token) + + regex = /(.+)<\/?\w+>|<\/?\w+>(.+)/ + captures = token.match(regex).captures + + text_before_tag = captures[0] + text_after_tag = captures[1] + + if text_before_tag + tag = token.sub(text_before_tag, "") + [text_before_tag, tag] + elsif text_after_tag + tag = token.sub(text_after_tag, "") + [tag, text_after_tag] + end + end end diff --git a/spec/dom_reader_spec.rb b/spec/dom_reader_spec.rb index 36f27b1..5bb1be2 100644 --- a/spec/dom_reader_spec.rb +++ b/spec/dom_reader_spec.rb @@ -82,4 +82,48 @@ expect(tokens[3]).to eq("data\n") end end + + describe "#split_tag_and_text" do + it "returns nil if the token to be split does not have mixed content" do + expect(dom_reader.split_tag_and_text("text")).to be_nil + expect(dom_reader.split_tag_and_text("")).to be_nil + expect(dom_reader.split_tag_and_text("")).to be_nil + end + + it "correctly splits a token that has a start tag, then text" do + test_token = "text" + tokens = dom_reader.split_tag_and_text(test_token) + + expect(tokens.length).to eq(2) + expect(tokens[0]).to eq("") + expect(tokens[1]).to eq("text") + end + + it "correctly splits a token that has an end tag, then text" do + test_token = "text" + tokens = dom_reader.split_tag_and_text(test_token) + + expect(tokens.length).to eq(2) + expect(tokens[0]).to eq("") + expect(tokens[1]).to eq("text") + end + + it "correctly splits a token that has text, then a start tag" do + test_token = "text" + tokens = dom_reader.split_tag_and_text(test_token) + + expect(tokens.length).to eq(2) + expect(tokens[0]).to eq("text") + expect(tokens[1]).to eq("") + end + + it "correctly splits a token that has text, then an end tag" do + test_token = "text" + tokens = dom_reader.split_tag_and_text(test_token) + + expect(tokens.length).to eq(2) + expect(tokens[0]).to eq("text") + expect(tokens[1]).to eq("") + end + end end From 81eb772350b59e2934a8c33c90f777ff43fd99f0 Mon Sep 17 00:00:00 2001 From: Roy Chen Date: Thu, 20 Apr 2017 12:52:44 +0800 Subject: [PATCH 031/104] added specs for tokenizing mixed content --- spec/dom_reader_spec.rb | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/spec/dom_reader_spec.rb b/spec/dom_reader_spec.rb index 5bb1be2..71220fb 100644 --- a/spec/dom_reader_spec.rb +++ b/spec/dom_reader_spec.rb @@ -81,6 +81,17 @@ expect(tokens[2]).to eq("contains") expect(tokens[3]).to eq("data\n") end + + it "correctly tokenizes mixed content" do + test_string = "some text" + tokens = dom_reader.tokenize(test_string) + + expect(tokens.length).to eq(4) + expect(tokens[0]).to eq("") + expect(tokens[1]).to eq("some") + expect(tokens[2]).to eq("text") + expect(tokens[3]).to eq(" Date: Thu, 20 Apr 2017 12:53:20 +0800 Subject: [PATCH 032/104] first attempt at tokenizing mixed content --- lib/dom_reader.rb | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/lib/dom_reader.rb b/lib/dom_reader.rb index c430db5..ec5de77 100644 --- a/lib/dom_reader.rb +++ b/lib/dom_reader.rb @@ -30,6 +30,15 @@ def is_mixed_content?(text) def tokenize(text) regex = /(?)/ tokens = text.scan(regex).flatten + + tokens.each do |token| + if is_mixed_content?(token) + new_tokens = split_tag_and_text(token) + tokens[tokens.index(token)] = new_tokens + end + end + + tokens.flatten end def split_tag_and_text(token) From b38bfcbaf10a1a3f1d52094ea5f6bef7fd53de5f Mon Sep 17 00:00:00 2001 From: Roy Chen Date: Thu, 20 Apr 2017 13:02:42 +0800 Subject: [PATCH 033/104] modified specs/behavior for #is_open_tag? to check for tag attributes --- lib/dom_reader.rb | 3 ++- spec/dom_reader_spec.rb | 2 ++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/lib/dom_reader.rb b/lib/dom_reader.rb index ec5de77..00b04b3 100644 --- a/lib/dom_reader.rb +++ b/lib/dom_reader.rb @@ -8,7 +8,8 @@ def read_file(filename) end def is_open_tag?(text) - regex = /^<(\w+)>$/ + # regex = /^<(\w+)>$/ + regex = /^<(\w+)\s*(.+)*>$/ !text.match(regex).nil? end diff --git a/spec/dom_reader_spec.rb b/spec/dom_reader_spec.rb index 71220fb..dea9ded 100644 --- a/spec/dom_reader_spec.rb +++ b/spec/dom_reader_spec.rb @@ -25,6 +25,8 @@ describe "#is_open_tag?" do it "returns true if the argument is an opening tag" do expect(dom_reader.is_open_tag?("")).to be true + expect(dom_reader.is_open_tag?("")).to be true + expect(dom_reader.is_open_tag?("")).to be true end it "returns false otherwise" do From acddb2f788d8ccd11f1e8e7edc0c8da51fbd7160 Mon Sep 17 00:00:00 2001 From: Roy Chen Date: Thu, 20 Apr 2017 13:09:34 +0800 Subject: [PATCH 034/104] modified specs/behavior for #is_mixed_content? to check for tag attributes --- lib/dom_reader.rb | 3 ++- spec/dom_reader_spec.rb | 5 +++++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/lib/dom_reader.rb b/lib/dom_reader.rb index 00b04b3..8c755fd 100644 --- a/lib/dom_reader.rb +++ b/lib/dom_reader.rb @@ -24,7 +24,8 @@ def is_text?(text) end def is_mixed_content?(text) - regex = /(.+)<\/?\w+>|<\/?\w+>(.+)/ + # regex = /(.+)<\/?\w+>|<\/?\w+>(.+)/ + regex = /(.+)<\/?\w+\s*.*>|<\/?\w+\s*.*>(.+)/ !text.match(regex).nil? end diff --git a/spec/dom_reader_spec.rb b/spec/dom_reader_spec.rb index dea9ded..a7e12e6 100644 --- a/spec/dom_reader_spec.rb +++ b/spec/dom_reader_spec.rb @@ -63,6 +63,11 @@ expect(dom_reader.is_mixed_content?("text")).to be true expect(dom_reader.is_mixed_content?("text")).to be true expect(dom_reader.is_mixed_content?("text")).to be true + + expect(dom_reader.is_mixed_content?("text")).to be true + expect(dom_reader.is_mixed_content?("text")).to be true + expect(dom_reader.is_mixed_content?("text")).to be true + expect(dom_reader.is_mixed_content?("text")).to be true end it "returns false otherwise" do From e39ba23ff8cdaf3eccacdfd6ec6287a31a823b01 Mon Sep 17 00:00:00 2001 From: Roy Chen Date: Thu, 20 Apr 2017 13:35:49 +0800 Subject: [PATCH 035/104] additional spec for #is_text? method --- spec/dom_reader_spec.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spec/dom_reader_spec.rb b/spec/dom_reader_spec.rb index a7e12e6..0e95600 100644 --- a/spec/dom_reader_spec.rb +++ b/spec/dom_reader_spec.rb @@ -49,6 +49,7 @@ describe "#is_text?" do it "returns true if the argument is just text (not a tag)" do expect(dom_reader.is_text?("text")).to be true + expect(dom_reader.is_text?("multiple words here!!!")).to be true end it "returns false otherwise" do @@ -65,7 +66,6 @@ expect(dom_reader.is_mixed_content?("text")).to be true expect(dom_reader.is_mixed_content?("text")).to be true - expect(dom_reader.is_mixed_content?("text")).to be true expect(dom_reader.is_mixed_content?("text")).to be true expect(dom_reader.is_mixed_content?("text")).to be true end From 22c64cb89f0c43b5b741a5c326655a6460c56dfe Mon Sep 17 00:00:00 2001 From: Roy Chen Date: Thu, 20 Apr 2017 13:41:48 +0800 Subject: [PATCH 036/104] reworked specs / regex for #tokenize method --- lib/dom_reader.rb | 12 +++--------- spec/dom_reader_spec.rb | 14 +++++--------- 2 files changed, 8 insertions(+), 18 deletions(-) diff --git a/lib/dom_reader.rb b/lib/dom_reader.rb index 8c755fd..70ef5ff 100644 --- a/lib/dom_reader.rb +++ b/lib/dom_reader.rb @@ -30,17 +30,11 @@ def is_mixed_content?(text) end def tokenize(text) - regex = /(?)/ + # regex = /(?)/ + regex = /(<.+?>|[^<>]+|<\/\w+?>)/ tokens = text.scan(regex).flatten - tokens.each do |token| - if is_mixed_content?(token) - new_tokens = split_tag_and_text(token) - tokens[tokens.index(token)] = new_tokens - end - end - - tokens.flatten + tokens end def split_tag_and_text(token) diff --git a/spec/dom_reader_spec.rb b/spec/dom_reader_spec.rb index 0e95600..ba73437 100644 --- a/spec/dom_reader_spec.rb +++ b/spec/dom_reader_spec.rb @@ -82,22 +82,18 @@ test_string = "This document contains data\n" tokens = dom_reader.tokenize(test_string) - expect(tokens.length).to eq(4) - expect(tokens[0]).to eq("This") - expect(tokens[1]).to eq("document") - expect(tokens[2]).to eq("contains") - expect(tokens[3]).to eq("data\n") + expect(tokens.length).to eq(1) + expect(tokens[0]).to eq("This document contains data\n") end it "correctly tokenizes mixed content" do test_string = "some text" tokens = dom_reader.tokenize(test_string) - expect(tokens.length).to eq(4) + expect(tokens.length).to eq(3) expect(tokens[0]).to eq("") - expect(tokens[1]).to eq("some") - expect(tokens[2]).to eq("text") - expect(tokens[3]).to eq("") end end From cf4380d48b43c095014f51092b99b6100f2a2e09 Mon Sep 17 00:00:00 2001 From: Roy Chen Date: Thu, 20 Apr 2017 13:43:31 +0800 Subject: [PATCH 037/104] removed unnecessary specs/methods --- lib/dom_reader.rb | 25 ---------------- spec/dom_reader_spec.rb | 63 ----------------------------------------- 2 files changed, 88 deletions(-) diff --git a/lib/dom_reader.rb b/lib/dom_reader.rb index 70ef5ff..f60d8cb 100644 --- a/lib/dom_reader.rb +++ b/lib/dom_reader.rb @@ -23,35 +23,10 @@ def is_text?(text) !text.match(regex).nil? end - def is_mixed_content?(text) - # regex = /(.+)<\/?\w+>|<\/?\w+>(.+)/ - regex = /(.+)<\/?\w+\s*.*>|<\/?\w+\s*.*>(.+)/ - !text.match(regex).nil? - end - def tokenize(text) - # regex = /(?)/ regex = /(<.+?>|[^<>]+|<\/\w+?>)/ tokens = text.scan(regex).flatten tokens end - - def split_tag_and_text(token) - return nil unless is_mixed_content?(token) - - regex = /(.+)<\/?\w+>|<\/?\w+>(.+)/ - captures = token.match(regex).captures - - text_before_tag = captures[0] - text_after_tag = captures[1] - - if text_before_tag - tag = token.sub(text_before_tag, "") - [text_before_tag, tag] - elsif text_after_tag - tag = token.sub(text_after_tag, "") - [tag, text_after_tag] - end - end end diff --git a/spec/dom_reader_spec.rb b/spec/dom_reader_spec.rb index ba73437..e3caed2 100644 --- a/spec/dom_reader_spec.rb +++ b/spec/dom_reader_spec.rb @@ -58,25 +58,6 @@ end end - describe "#is_mixed_content?" do - it "returns true if the argument contains both a tag and text" do - expect(dom_reader.is_mixed_content?("text")).to be true - expect(dom_reader.is_mixed_content?("text")).to be true - expect(dom_reader.is_mixed_content?("text")).to be true - expect(dom_reader.is_mixed_content?("text")).to be true - - expect(dom_reader.is_mixed_content?("text")).to be true - expect(dom_reader.is_mixed_content?("text")).to be true - expect(dom_reader.is_mixed_content?("text")).to be true - end - - it "returns false otherwise" do - expect(dom_reader.is_mixed_content?("")).to be false - expect(dom_reader.is_mixed_content?("")).to be false - expect(dom_reader.is_mixed_content?("text")).to be false - end - end - describe "#tokenize" do it "breaks down the input string into tokens" do test_string = "This document contains data\n" @@ -96,48 +77,4 @@ expect(tokens[2]).to eq("") end end - - describe "#split_tag_and_text" do - it "returns nil if the token to be split does not have mixed content" do - expect(dom_reader.split_tag_and_text("text")).to be_nil - expect(dom_reader.split_tag_and_text("")).to be_nil - expect(dom_reader.split_tag_and_text("")).to be_nil - end - - it "correctly splits a token that has a start tag, then text" do - test_token = "text" - tokens = dom_reader.split_tag_and_text(test_token) - - expect(tokens.length).to eq(2) - expect(tokens[0]).to eq("") - expect(tokens[1]).to eq("text") - end - - it "correctly splits a token that has an end tag, then text" do - test_token = "text" - tokens = dom_reader.split_tag_and_text(test_token) - - expect(tokens.length).to eq(2) - expect(tokens[0]).to eq("") - expect(tokens[1]).to eq("text") - end - - it "correctly splits a token that has text, then a start tag" do - test_token = "text" - tokens = dom_reader.split_tag_and_text(test_token) - - expect(tokens.length).to eq(2) - expect(tokens[0]).to eq("text") - expect(tokens[1]).to eq("") - end - - it "correctly splits a token that has text, then an end tag" do - test_token = "text" - tokens = dom_reader.split_tag_and_text(test_token) - - expect(tokens.length).to eq(2) - expect(tokens[0]).to eq("text") - expect(tokens[1]).to eq("") - end - end end From f7db9b3036bd0bad227589e68b2353a5d77ee085 Mon Sep 17 00:00:00 2001 From: Roy Chen Date: Thu, 20 Apr 2017 13:48:15 +0800 Subject: [PATCH 038/104] renamed spec --- spec/dom_reader_spec.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spec/dom_reader_spec.rb b/spec/dom_reader_spec.rb index e3caed2..ef300c7 100644 --- a/spec/dom_reader_spec.rb +++ b/spec/dom_reader_spec.rb @@ -67,7 +67,7 @@ expect(tokens[0]).to eq("This document contains data\n") end - it "correctly tokenizes mixed content" do + it "correctly tokenizes mixed content on the same line" do test_string = "some text" tokens = dom_reader.tokenize(test_string) From 93e35a403ed4c49f1966562a078ef17442b9a780 Mon Sep 17 00:00:00 2001 From: Roy Chen Date: Thu, 20 Apr 2017 14:08:45 +0800 Subject: [PATCH 039/104] added specs/behavior for #remove_doctype method --- lib/dom_reader.rb | 5 +++++ spec/dom_reader_spec.rb | 7 +++++++ 2 files changed, 12 insertions(+) diff --git a/lib/dom_reader.rb b/lib/dom_reader.rb index f60d8cb..77a5da7 100644 --- a/lib/dom_reader.rb +++ b/lib/dom_reader.rb @@ -29,4 +29,9 @@ def tokenize(text) tokens end + + def remove_doctype(text) + regex = /\s*?/i + text.gsub(regex, "") + end end diff --git a/spec/dom_reader_spec.rb b/spec/dom_reader_spec.rb index ef300c7..7bd5a69 100644 --- a/spec/dom_reader_spec.rb +++ b/spec/dom_reader_spec.rb @@ -77,4 +77,11 @@ expect(tokens[2]).to eq("") end end + + describe "#remove_doctype" do + it "removes the doctype declaration from the input text, if it exists" do + test_string = "\n\n" + expect(dom_reader.remove_doctype(test_string)).to eq "\n\n" + end + end end From 8c123b74c99339d13b08f2dbc7975b3970cf2848 Mon Sep 17 00:00:00 2001 From: Roy Chen Date: Thu, 20 Apr 2017 14:11:21 +0800 Subject: [PATCH 040/104] refactored #read_file method --- lib/dom_reader.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/dom_reader.rb b/lib/dom_reader.rb index 77a5da7..171048e 100644 --- a/lib/dom_reader.rb +++ b/lib/dom_reader.rb @@ -4,7 +4,7 @@ def initialize; end def read_file(filename) raise "The file to be read does not exist." unless File.exist?(filename) - contents = File.readlines(filename) + File.readlines(filename) end def is_open_tag?(text) From 73553d23ae9392dd9291bc09743e97676f91be93 Mon Sep 17 00:00:00 2001 From: Roy Chen Date: Thu, 20 Apr 2017 14:19:49 +0800 Subject: [PATCH 041/104] defined initial specs for #token_type method --- spec/dom_reader_spec.rb | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/spec/dom_reader_spec.rb b/spec/dom_reader_spec.rb index 7bd5a69..f197381 100644 --- a/spec/dom_reader_spec.rb +++ b/spec/dom_reader_spec.rb @@ -84,4 +84,14 @@ expect(dom_reader.remove_doctype(test_string)).to eq "\n\n" end end + + describe "#token_type" do + it "returns :open_tag if the input is an opening tag" + + it "returns :close_tag if the input is a closing tag" + + it "returns :text if the input is just text" + + it "returns :unknown otherwise" + end end From b6d8db388bfc2d86f0199f6928924f3fce27b596 Mon Sep 17 00:00:00 2001 From: Roy Chen Date: Thu, 20 Apr 2017 14:30:29 +0800 Subject: [PATCH 042/104] added example.rb to run test code --- example.rb | 13 +++++++++++++ 1 file changed, 13 insertions(+) create mode 100644 example.rb diff --git a/example.rb b/example.rb new file mode 100644 index 0000000..c72f0e3 --- /dev/null +++ b/example.rb @@ -0,0 +1,13 @@ +if $0 == __FILE__ + require_relative './lib/dom_reader' + + dom_reader = DOMReader.new + contents = dom_reader.read_file("./test.html") + + contents.each do |line| + tokens = dom_reader.tokenize(line) + tokens.each do |token| + + end + end +end From 8524d2ab3bee51070fca842d13bf5455c8827795 Mon Sep 17 00:00:00 2001 From: Roy Chen Date: Thu, 20 Apr 2017 14:30:38 +0800 Subject: [PATCH 043/104] added specs/behavior for #token_type --- lib/dom_reader.rb | 12 ++++++++++++ spec/dom_reader_spec.rb | 29 +++++++++++++++++++++++++---- 2 files changed, 37 insertions(+), 4 deletions(-) diff --git a/lib/dom_reader.rb b/lib/dom_reader.rb index 171048e..49546b6 100644 --- a/lib/dom_reader.rb +++ b/lib/dom_reader.rb @@ -30,6 +30,18 @@ def tokenize(text) tokens end + def token_type(token) + return :unknown unless token.is_a?(String) + + return :open_tag if is_open_tag?(token) + + return :close_tag if is_close_tag?(token) + + return :text if is_text?(token) + + :unknown + end + def remove_doctype(text) regex = /\s*?/i text.gsub(regex, "") diff --git a/spec/dom_reader_spec.rb b/spec/dom_reader_spec.rb index f197381..d97f17d 100644 --- a/spec/dom_reader_spec.rb +++ b/spec/dom_reader_spec.rb @@ -86,12 +86,33 @@ end describe "#token_type" do - it "returns :open_tag if the input is an opening tag" + it "returns :open_tag if the input is an opening tag" do + token = "" + expect(dom_reader.token_type(token)).to eq(:open_tag) - it "returns :close_tag if the input is a closing tag" + token = "" + expect(dom_reader.token_type(token)).to eq(:open_tag) + end + + it "returns :close_tag if the input is a closing tag" do + token = "" + expect(dom_reader.token_type(token)).to eq(:close_tag) + end - it "returns :text if the input is just text" + it "returns :text if the input is just text" do + token = "plaintext" + expect(dom_reader.token_type(token)).to eq(:text) - it "returns :unknown otherwise" + token = "This is some mixed text!! Blah blah 234 hrgargh" + expect(dom_reader.token_type(token)).to eq(:text) + end + + it "returns :unknown otherwise" do + token = ["an", "array"] + expect(dom_reader.token_type(token)).to eq(:unknown) + + token = "foobar Date: Thu, 20 Apr 2017 16:40:12 +0800 Subject: [PATCH 044/104] updated test code --- example.rb | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/example.rb b/example.rb index c72f0e3..dd79ee8 100644 --- a/example.rb +++ b/example.rb @@ -3,11 +3,15 @@ dom_reader = DOMReader.new contents = dom_reader.read_file("./test.html") + contents[0] = dom_reader.remove_doctype(contents[0]) + + # looks like tokens are being split out correctly + # contiguous whitespace / newlines are being parsed as tokens too contents.each do |line| tokens = dom_reader.tokenize(line) tokens.each do |token| - + p "token: #{token}, type: #{dom_reader.token_type(token)}" end end end From 2164b6ddffd92445f12e08a26007a3459acb52fa Mon Sep 17 00:00:00 2001 From: Roy Chen Date: Thu, 20 Apr 2017 16:47:03 +0800 Subject: [PATCH 045/104] initial specs for DOMTree class --- spec/dom_tree_spec.rb | 9 +++++++++ 1 file changed, 9 insertions(+) create mode 100644 spec/dom_tree_spec.rb diff --git a/spec/dom_tree_spec.rb b/spec/dom_tree_spec.rb new file mode 100644 index 0000000..a1dc765 --- /dev/null +++ b/spec/dom_tree_spec.rb @@ -0,0 +1,9 @@ +# spec/dom_tree_spec.rb + +require 'dom_tree' + +describe "DOMTree" do + describe "#initialize" do + it "creates an instance of DOMTree" + end +end From 50168783f008ffebf5ca49aab07cc92055677699 Mon Sep 17 00:00:00 2001 From: Roy Chen Date: Thu, 20 Apr 2017 16:47:34 +0800 Subject: [PATCH 046/104] initial DOMTree class definition --- lib/dom_tree.rb | 3 +++ 1 file changed, 3 insertions(+) create mode 100644 lib/dom_tree.rb diff --git a/lib/dom_tree.rb b/lib/dom_tree.rb new file mode 100644 index 0000000..37b31f9 --- /dev/null +++ b/lib/dom_tree.rb @@ -0,0 +1,3 @@ +class DOMTree + def initialize; end +end From eb5816bb7e36c601b701a60bc76550d952687539 Mon Sep 17 00:00:00 2001 From: Roy Chen Date: Thu, 20 Apr 2017 16:49:14 +0800 Subject: [PATCH 047/104] added basic spec for #initialize --- spec/dom_tree_spec.rb | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/spec/dom_tree_spec.rb b/spec/dom_tree_spec.rb index a1dc765..36c9d96 100644 --- a/spec/dom_tree_spec.rb +++ b/spec/dom_tree_spec.rb @@ -3,7 +3,11 @@ require 'dom_tree' describe "DOMTree" do + let(:domtree) { DOMTree.new } + describe "#initialize" do - it "creates an instance of DOMTree" + it "creates an instance of DOMTree" do + expect(domtree).to be_a(DOMTree) + end end end From df188656f8e4183cd627876445e251df50e0155b Mon Sep 17 00:00:00 2001 From: Roy Chen Date: Thu, 20 Apr 2017 16:55:46 +0800 Subject: [PATCH 048/104] added specs/behavior for instance variables --- lib/dom_tree.rb | 6 +++++- spec/dom_tree_spec.rb | 11 +++++++++++ 2 files changed, 16 insertions(+), 1 deletion(-) diff --git a/lib/dom_tree.rb b/lib/dom_tree.rb index 37b31f9..c2dad1d 100644 --- a/lib/dom_tree.rb +++ b/lib/dom_tree.rb @@ -1,3 +1,7 @@ class DOMTree - def initialize; end + attr_reader :document + + def initialize(document = nil) + @document = document + end end diff --git a/spec/dom_tree_spec.rb b/spec/dom_tree_spec.rb index 36c9d96..38de7e2 100644 --- a/spec/dom_tree_spec.rb +++ b/spec/dom_tree_spec.rb @@ -10,4 +10,15 @@ expect(domtree).to be_a(DOMTree) end end + + context "instance variables" do + it "allows the instance variable @document to be read" do + test_domtree = DOMTree.new("document") + expect(test_domtree.document).to eq("document") + end + + it "raises a NoMethodError if you attempt to set @document after object instantiation" do + expect { domtree.document = "new document" }.to raise_error(NoMethodError) + end + end end From 7d4cf9721c49f86b89caeba2c96e67189418118f Mon Sep 17 00:00:00 2001 From: Roy Chen Date: Thu, 20 Apr 2017 17:17:02 +0800 Subject: [PATCH 049/104] added specs/behavior for Node struct --- lib/dom_tree.rb | 5 +++++ spec/dom_tree_spec.rb | 40 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 45 insertions(+) diff --git a/lib/dom_tree.rb b/lib/dom_tree.rb index c2dad1d..d17c617 100644 --- a/lib/dom_tree.rb +++ b/lib/dom_tree.rb @@ -1,7 +1,12 @@ +Node = Struct.new(:type, :content, :depth, :parent, :children) + class DOMTree attr_reader :document def initialize(document = nil) + # root node of tree @document = document end + + # methods for traversing tree to be added here end diff --git a/spec/dom_tree_spec.rb b/spec/dom_tree_spec.rb index 38de7e2..4432310 100644 --- a/spec/dom_tree_spec.rb +++ b/spec/dom_tree_spec.rb @@ -21,4 +21,44 @@ expect { domtree.document = "new document" }.to raise_error(NoMethodError) end end + + context "Node struct" do + let(:node) { Node.new(:open_tag, "

", 0, nil, []) } + + context "Node attributes" do + it "has a :type attribute" do + expect(node.type).to eq(:open_tag) + node.type = :close_tag + expect(node.type).to eq(:close_tag) + end + + it "has a :content attribute" do + expect(node.content).to eq("

") + node.content = "

" + expect(node.content).to eq("

") + end + + it "has a :depth attribute" do + expect(node.depth).to eq(0) + node.depth = 1 + expect(node.depth).to eq(1) + end + + it "has a :parent attribute" do + expect(node.parent).to be nil + node.parent = Node.new(:text, "foobar", 0, nil, []) + expect(node.parent.content).to eq("foobar") + end + + it "has a :children attribute" do + expect(node.children).to be_a(Array) + expect(node.children.length).to eq(0) + node.children << "child" + expect(node.children[0]).to eq("child") + expect(node.children.length).to eq(1) + end + end + end + + # tree method specs to be added here, e.g. for traversal end From 9f9bd1417f9b36f102c2a40d6d70ed01422fb10f Mon Sep 17 00:00:00 2001 From: Roy Chen Date: Thu, 20 Apr 2017 18:00:46 +0800 Subject: [PATCH 050/104] changed #remove_doctype_tag method to #is_doctype_tag? method --- lib/dom_reader.rb | 10 +++++----- spec/dom_reader_spec.rb | 21 +++++++++++++-------- 2 files changed, 18 insertions(+), 13 deletions(-) diff --git a/lib/dom_reader.rb b/lib/dom_reader.rb index 49546b6..711ce6b 100644 --- a/lib/dom_reader.rb +++ b/lib/dom_reader.rb @@ -7,6 +7,11 @@ def read_file(filename) File.readlines(filename) end + def is_doctype_tag?(text) + regex = //i + !text.match(regex).nil? + end + def is_open_tag?(text) # regex = /^<(\w+)>$/ regex = /^<(\w+)\s*(.+)*>$/ @@ -41,9 +46,4 @@ def token_type(token) :unknown end - - def remove_doctype(text) - regex = /\s*?/i - text.gsub(regex, "") - end end diff --git a/spec/dom_reader_spec.rb b/spec/dom_reader_spec.rb index d97f17d..257a7ef 100644 --- a/spec/dom_reader_spec.rb +++ b/spec/dom_reader_spec.rb @@ -4,6 +4,7 @@ describe "DOMReader" do let(:dom_reader) { DOMReader.new } + let(:test_file) { './test.html' } describe "#initialize" do it "creates an instance of DOMReader" do @@ -17,11 +18,22 @@ end it "returns the file's contents as an array" do - test_file = './test.html' expect(dom_reader.read_file(test_file)).to be_a(Array) end end + describe "#is_doctype_tag?" do + it "returns true if the argument contains the doctype declaration" do + test_string = "\n\n" + expect(dom_reader.is_doctype_tag?(test_string)).to be true + end + + it "returns false otherwise" do + test_string = "some text" + expect(dom_reader.is_doctype_tag?(test_string)).to be false + end + end + describe "#is_open_tag?" do it "returns true if the argument is an opening tag" do expect(dom_reader.is_open_tag?("")).to be true @@ -78,13 +90,6 @@ end end - describe "#remove_doctype" do - it "removes the doctype declaration from the input text, if it exists" do - test_string = "\n\n" - expect(dom_reader.remove_doctype(test_string)).to eq "\n\n" - end - end - describe "#token_type" do it "returns :open_tag if the input is an opening tag" do token = "" From 94f3b0a267c87f7f5467be867bcbb6972ddffdfa Mon Sep 17 00:00:00 2001 From: Roy Chen Date: Thu, 20 Apr 2017 18:18:05 +0800 Subject: [PATCH 051/104] changed attr_reader to attr_accessor --- lib/dom_tree.rb | 2 +- spec/dom_tree_spec.rb | 8 +++++--- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/lib/dom_tree.rb b/lib/dom_tree.rb index d17c617..d9900b8 100644 --- a/lib/dom_tree.rb +++ b/lib/dom_tree.rb @@ -1,7 +1,7 @@ Node = Struct.new(:type, :content, :depth, :parent, :children) class DOMTree - attr_reader :document + attr_accessor :document def initialize(document = nil) # root node of tree diff --git a/spec/dom_tree_spec.rb b/spec/dom_tree_spec.rb index 4432310..579ea0e 100644 --- a/spec/dom_tree_spec.rb +++ b/spec/dom_tree_spec.rb @@ -11,14 +11,16 @@ end end - context "instance variables" do + context "object attributes" do it "allows the instance variable @document to be read" do test_domtree = DOMTree.new("document") expect(test_domtree.document).to eq("document") end - it "raises a NoMethodError if you attempt to set @document after object instantiation" do - expect { domtree.document = "new document" }.to raise_error(NoMethodError) + it "allows the instance variable @document to be set" do + test_domtree = DOMTree.new("document") + test_domtree.document = "new document" + expect(test_domtree.document).to eq("new document") end end From c8e2d6dc14684cb4a5694f279ae2e2b48b48c333 Mon Sep 17 00:00:00 2001 From: Roy Chen Date: Thu, 20 Apr 2017 18:18:47 +0800 Subject: [PATCH 052/104] added initial specs for #build_tree --- spec/dom_reader_spec.rb | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/spec/dom_reader_spec.rb b/spec/dom_reader_spec.rb index 257a7ef..eb408fe 100644 --- a/spec/dom_reader_spec.rb +++ b/spec/dom_reader_spec.rb @@ -12,6 +12,14 @@ end end + describe "#build_tree" do + it "returns a tree of Nodes built from the input filename" do + tree = dom_reader.build_tree(test_file) + expect(tree).to be_a(DOMTree) + expect(tree.document).not_to be nil + end + end + describe "#read_file" do it "raises an error if the file to be read does not exist" do expect { dom_reader.read_file("blah") }.to raise_error(/file to be read does not exist/) From 137404c0b61f999b9904d6885c4680ad3a5c1e23 Mon Sep 17 00:00:00 2001 From: Roy Chen Date: Thu, 20 Apr 2017 18:19:08 +0800 Subject: [PATCH 053/104] added initial implementation for #build_tree --- lib/dom_reader.rb | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/lib/dom_reader.rb b/lib/dom_reader.rb index 711ce6b..4ce004f 100644 --- a/lib/dom_reader.rb +++ b/lib/dom_reader.rb @@ -1,3 +1,5 @@ +require_relative './dom_tree' + class DOMReader def initialize; end @@ -7,6 +9,32 @@ def read_file(filename) File.readlines(filename) end + def build_tree(filename) + # read in the file contents + lines = read_file(filename) + + # break file contents into tokens + tokens = [] + lines.each { |line| tokens << tokenize(line) } + tokens.flatten! + + # always good to check if our input is valid.. + unless tokens.empty? + + # remove doctype declaration, if present + tokens.delete_at(0) if is_doctype_tag?(tokens[0]) + + # TODO: remove this enumeration code + tokens.each { |token| p "#{token}, #{token_type(token)}" } + + # until tokens.empty? + # + # end + + end + end + + def is_doctype_tag?(text) regex = //i !text.match(regex).nil? From aabc442539035180cf59fd4e180b3df8e7a51f45 Mon Sep 17 00:00:00 2001 From: Roy Chen Date: Sat, 22 Apr 2017 07:09:14 +0800 Subject: [PATCH 054/104] implemented #build_tree method --- lib/dom_reader.rb | 44 ++++++++++++++++++++++++++++++++++++-------- 1 file changed, 36 insertions(+), 8 deletions(-) diff --git a/lib/dom_reader.rb b/lib/dom_reader.rb index 4ce004f..18090ba 100644 --- a/lib/dom_reader.rb +++ b/lib/dom_reader.rb @@ -20,18 +20,46 @@ def build_tree(filename) # always good to check if our input is valid.. unless tokens.empty? - # remove doctype declaration, if present tokens.delete_at(0) if is_doctype_tag?(tokens[0]) - # TODO: remove this enumeration code - tokens.each { |token| p "#{token}, #{token_type(token)}" } - - # until tokens.empty? - # - # end - + # initialize the tree + tree = DOMTree.new + current_node = tree.document + + # set current depth of the tree + current_depth = 0 + + until tokens.empty? + token = tokens.shift + token_type = token_type(token) + + # create tree's root node if it does not exist yet + if tree.document.nil? + tree.document = Node.new(token_type, token, current_depth, nil, [] ) + current_node = tree.document + else + # root node already exists, create new node + case token_type + when :open_tag + node = Node.new(token_type, token, current_depth, current_node, []) + current_node.children << node + current_node = node + current_depth += 1 + when :text + node = Node.new(token_type, token, current_depth, current_node, []) + current_node.children << node + when :close_tag + current_node = current_node.parent + current_depth -= 1 + when :unknown + # ignore unknown tag + end + end + end end + + tree end From 88e82055a3b8620a9e73148ca685c3d90d9256a9 Mon Sep 17 00:00:00 2001 From: Roy Chen Date: Sat, 22 Apr 2017 09:21:16 +0800 Subject: [PATCH 055/104] modified specs for #read_file to use File stub --- spec/dom_reader_spec.rb | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/spec/dom_reader_spec.rb b/spec/dom_reader_spec.rb index eb408fe..5360888 100644 --- a/spec/dom_reader_spec.rb +++ b/spec/dom_reader_spec.rb @@ -26,7 +26,13 @@ end it "returns the file's contents as an array" do - expect(dom_reader.read_file(test_file)).to be_a(Array) + filename = './test.html' + contents = ["\n", "\n", "\n", "Title\n", "\n", "\n", "

text

\n", "\n", "\n"] + + allow(File).to receive(:readlines).with(filename).and_return(contents) + + expect(dom_reader.read_file(filename)).to be_a(Array) + expect(dom_reader.read_file(filename)).to eq(contents) end end From 9f01c2703e1d2bdd178642eb130d554c69719ca6 Mon Sep 17 00:00:00 2001 From: Roy Chen Date: Sat, 22 Apr 2017 09:24:55 +0800 Subject: [PATCH 056/104] modified spec for #build_tree to use File stub --- spec/dom_reader_spec.rb | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/spec/dom_reader_spec.rb b/spec/dom_reader_spec.rb index 5360888..ad913a9 100644 --- a/spec/dom_reader_spec.rb +++ b/spec/dom_reader_spec.rb @@ -4,7 +4,6 @@ describe "DOMReader" do let(:dom_reader) { DOMReader.new } - let(:test_file) { './test.html' } describe "#initialize" do it "creates an instance of DOMReader" do @@ -14,7 +13,12 @@ describe "#build_tree" do it "returns a tree of Nodes built from the input filename" do - tree = dom_reader.build_tree(test_file) + filename = './test.html' + contents = ["\n", "\n", "\n", "Title\n", "\n", "\n", "

text

\n", "\n", "\n"] + + allow(File).to receive(:readlines).with(filename).and_return(contents) + + tree = dom_reader.build_tree(filename) expect(tree).to be_a(DOMTree) expect(tree.document).not_to be nil end From fdaeb895d98a137581eda27c4cb4646b514e11ad Mon Sep 17 00:00:00 2001 From: Roy Chen Date: Sat, 22 Apr 2017 09:49:57 +0800 Subject: [PATCH 057/104] modified spec for #build_tree --- spec/dom_reader_spec.rb | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/spec/dom_reader_spec.rb b/spec/dom_reader_spec.rb index ad913a9..bf3e5a0 100644 --- a/spec/dom_reader_spec.rb +++ b/spec/dom_reader_spec.rb @@ -14,13 +14,16 @@ describe "#build_tree" do it "returns a tree of Nodes built from the input filename" do filename = './test.html' - contents = ["\n", "\n", "\n", "Title\n", "\n", "\n", "

text

\n", "\n", "\n"] + # contents = ["\n", "\n", "\n", "Title\n", "\n", "\n", "

text

\n", "\n", "\n"] + + contents = ["\n", "\n", "Title\n", "\n", "\n", "

text

\n", "\n", ""] allow(File).to receive(:readlines).with(filename).and_return(contents) tree = dom_reader.build_tree(filename) expect(tree).to be_a(DOMTree) - expect(tree.document).not_to be nil + expect(tree.document).to be_a(Node) + expect(tree.document.type).to eq(:open_tag) end end From dc3d9f0a2ae93ebb8a49656f0e7990bc651b2c2f Mon Sep 17 00:00:00 2001 From: Roy Chen Date: Sat, 22 Apr 2017 09:50:25 +0800 Subject: [PATCH 058/104] fixed bug in #build_tree with traversing up to parent node --- lib/dom_reader.rb | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/dom_reader.rb b/lib/dom_reader.rb index 18090ba..ca6cd16 100644 --- a/lib/dom_reader.rb +++ b/lib/dom_reader.rb @@ -36,7 +36,7 @@ def build_tree(filename) # create tree's root node if it does not exist yet if tree.document.nil? - tree.document = Node.new(token_type, token, current_depth, nil, [] ) + tree.document = Node.new(token_type, token, current_depth, nil, []) current_node = tree.document else # root node already exists, create new node @@ -50,7 +50,7 @@ def build_tree(filename) node = Node.new(token_type, token, current_depth, current_node, []) current_node.children << node when :close_tag - current_node = current_node.parent + current_node = current_node.parent unless current_node == tree.document current_depth -= 1 when :unknown # ignore unknown tag From 3faac9293c2c87b5db8fa0d71652d83ac8e4d403 Mon Sep 17 00:00:00 2001 From: Roy Chen Date: Sat, 22 Apr 2017 09:57:58 +0800 Subject: [PATCH 059/104] reordered methods --- lib/dom_reader.rb | 37 ++++++++++++++++++------------------- 1 file changed, 18 insertions(+), 19 deletions(-) diff --git a/lib/dom_reader.rb b/lib/dom_reader.rb index ca6cd16..a7679cb 100644 --- a/lib/dom_reader.rb +++ b/lib/dom_reader.rb @@ -62,6 +62,24 @@ def build_tree(filename) tree end + def tokenize(text) + regex = /(<.+?>|[^<>]+|<\/\w+?>)/ + tokens = text.scan(regex).flatten + + tokens + end + + def token_type(token) + return :unknown unless token.is_a?(String) + + return :open_tag if is_open_tag?(token) + + return :close_tag if is_close_tag?(token) + + return :text if is_text?(token) + + :unknown + end def is_doctype_tag?(text) regex = //i @@ -83,23 +101,4 @@ def is_text?(text) regex = /^[^<>\/]+$/ !text.match(regex).nil? end - - def tokenize(text) - regex = /(<.+?>|[^<>]+|<\/\w+?>)/ - tokens = text.scan(regex).flatten - - tokens - end - - def token_type(token) - return :unknown unless token.is_a?(String) - - return :open_tag if is_open_tag?(token) - - return :close_tag if is_close_tag?(token) - - return :text if is_text?(token) - - :unknown - end end From e567501e402eda7a14c754b5bc21d0a80dc47e67 Mon Sep 17 00:00:00 2001 From: Roy Chen Date: Sat, 22 Apr 2017 10:04:02 +0800 Subject: [PATCH 060/104] added specs/implementation for #tokenize_file method --- lib/dom_reader.rb | 19 +++++++++++++------ spec/dom_reader_spec.rb | 13 +++++++++++++ 2 files changed, 26 insertions(+), 6 deletions(-) diff --git a/lib/dom_reader.rb b/lib/dom_reader.rb index a7679cb..d00e992 100644 --- a/lib/dom_reader.rb +++ b/lib/dom_reader.rb @@ -3,12 +3,6 @@ class DOMReader def initialize; end - def read_file(filename) - raise "The file to be read does not exist." unless File.exist?(filename) - - File.readlines(filename) - end - def build_tree(filename) # read in the file contents lines = read_file(filename) @@ -62,6 +56,19 @@ def build_tree(filename) tree end + def read_file(filename) + raise "The file to be read does not exist." unless File.exist?(filename) + + File.readlines(filename) + end + + def tokenize_file(filename) + lines = read_file(filename) + tokens = [] + lines.each { |line| tokens << tokenize(line) } + tokens.flatten! + end + def tokenize(text) regex = /(<.+?>|[^<>]+|<\/\w+?>)/ tokens = text.scan(regex).flatten diff --git a/spec/dom_reader_spec.rb b/spec/dom_reader_spec.rb index bf3e5a0..1726727 100644 --- a/spec/dom_reader_spec.rb +++ b/spec/dom_reader_spec.rb @@ -43,6 +43,19 @@ end end + describe "#tokenize_file" do + it "returns the file's contents as an array of tokens" do + filename = './test.html' + contents = ["\n", "\n", "\n"] + + allow(File).to receive(:readlines).with(filename).and_return(contents) + + expected_result = ["", "\n", "", "\n", "", "\n"] + + expect(dom_reader.tokenize_file(filename)).to eq(expected_result) + end + end + describe "#is_doctype_tag?" do it "returns true if the argument contains the doctype declaration" do test_string = "\n\n" From c64ba8eca144c385046dc4c10f355abcb4892807 Mon Sep 17 00:00:00 2001 From: Roy Chen Date: Sat, 22 Apr 2017 10:07:04 +0800 Subject: [PATCH 061/104] refactored #build_tree to use #tokenize_file --- lib/dom_reader.rb | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/lib/dom_reader.rb b/lib/dom_reader.rb index d00e992..afaba8d 100644 --- a/lib/dom_reader.rb +++ b/lib/dom_reader.rb @@ -4,13 +4,8 @@ class DOMReader def initialize; end def build_tree(filename) - # read in the file contents - lines = read_file(filename) - - # break file contents into tokens - tokens = [] - lines.each { |line| tokens << tokenize(line) } - tokens.flatten! + # break file into tokens + tokens = tokenize_file(filename) # always good to check if our input is valid.. unless tokens.empty? From 436ce9f8a7e3f01edcb138d17659e5d7474cdac9 Mon Sep 17 00:00:00 2001 From: Roy Chen Date: Sat, 22 Apr 2017 10:08:32 +0800 Subject: [PATCH 062/104] change specs order --- spec/dom_reader_spec.rb | 96 ++++++++++++++++++++--------------------- 1 file changed, 48 insertions(+), 48 deletions(-) diff --git a/spec/dom_reader_spec.rb b/spec/dom_reader_spec.rb index 1726727..9b2de41 100644 --- a/spec/dom_reader_spec.rb +++ b/spec/dom_reader_spec.rb @@ -56,54 +56,6 @@ end end - describe "#is_doctype_tag?" do - it "returns true if the argument contains the doctype declaration" do - test_string = "\n\n" - expect(dom_reader.is_doctype_tag?(test_string)).to be true - end - - it "returns false otherwise" do - test_string = "some text" - expect(dom_reader.is_doctype_tag?(test_string)).to be false - end - end - - describe "#is_open_tag?" do - it "returns true if the argument is an opening tag" do - expect(dom_reader.is_open_tag?("")).to be true - expect(dom_reader.is_open_tag?("")).to be true - expect(dom_reader.is_open_tag?("")).to be true - end - - it "returns false otherwise" do - expect(dom_reader.is_open_tag?("text")).to be false - expect(dom_reader.is_open_tag?("")).to be false - end - end - - describe "#is_close_tag?" do - it "returns true if the argument is a closing tag" do - expect(dom_reader.is_close_tag?("")).to be true - end - - it "returns false otherwise" do - expect(dom_reader.is_close_tag?("")).to be false - expect(dom_reader.is_close_tag?("text")).to be false - end - end - - describe "#is_text?" do - it "returns true if the argument is just text (not a tag)" do - expect(dom_reader.is_text?("text")).to be true - expect(dom_reader.is_text?("multiple words here!!!")).to be true - end - - it "returns false otherwise" do - expect(dom_reader.is_text?("")).to be false - expect(dom_reader.is_text?("")).to be false - end - end - describe "#tokenize" do it "breaks down the input string into tokens" do test_string = "This document contains data\n" @@ -154,4 +106,52 @@ expect(dom_reader.token_type(token)).to eq(:unknown) end end + + describe "#is_doctype_tag?" do + it "returns true if the argument contains the doctype declaration" do + test_string = "\n\n" + expect(dom_reader.is_doctype_tag?(test_string)).to be true + end + + it "returns false otherwise" do + test_string = "some text" + expect(dom_reader.is_doctype_tag?(test_string)).to be false + end + end + + describe "#is_open_tag?" do + it "returns true if the argument is an opening tag" do + expect(dom_reader.is_open_tag?("")).to be true + expect(dom_reader.is_open_tag?("")).to be true + expect(dom_reader.is_open_tag?("")).to be true + end + + it "returns false otherwise" do + expect(dom_reader.is_open_tag?("text")).to be false + expect(dom_reader.is_open_tag?("")).to be false + end + end + + describe "#is_close_tag?" do + it "returns true if the argument is a closing tag" do + expect(dom_reader.is_close_tag?("")).to be true + end + + it "returns false otherwise" do + expect(dom_reader.is_close_tag?("")).to be false + expect(dom_reader.is_close_tag?("text")).to be false + end + end + + describe "#is_text?" do + it "returns true if the argument is just text (not a tag)" do + expect(dom_reader.is_text?("text")).to be true + expect(dom_reader.is_text?("multiple words here!!!")).to be true + end + + it "returns false otherwise" do + expect(dom_reader.is_text?("")).to be false + expect(dom_reader.is_text?("")).to be false + end + end end From 0bf9b88a412a3025751ffae0acb7b151f2918110 Mon Sep 17 00:00:00 2001 From: Roy Chen Date: Sat, 22 Apr 2017 10:12:23 +0800 Subject: [PATCH 063/104] added specs/behavior for #remove_doctype method --- lib/dom_reader.rb | 5 +++++ spec/dom_reader_spec.rb | 9 +++++++++ 2 files changed, 14 insertions(+) diff --git a/lib/dom_reader.rb b/lib/dom_reader.rb index afaba8d..713da05 100644 --- a/lib/dom_reader.rb +++ b/lib/dom_reader.rb @@ -71,6 +71,11 @@ def tokenize(text) tokens end + def remove_doctype(tokens) + tokens.delete_at(0) if is_doctype_tag?(tokens[0]) + tokens + end + def token_type(token) return :unknown unless token.is_a?(String) diff --git a/spec/dom_reader_spec.rb b/spec/dom_reader_spec.rb index 9b2de41..12a59be 100644 --- a/spec/dom_reader_spec.rb +++ b/spec/dom_reader_spec.rb @@ -76,6 +76,15 @@ end end + describe "#remove_doctype" do + it "removes any doctype declaration tokens from the provided list of tokens" do + tokens = ["", "\n", "", "\n", "", "\n"] + expected_result = ["\n", "", "\n", "", "\n"] + + expect(dom_reader.remove_doctype(tokens)).to eq(expected_result) + end + end + describe "#token_type" do it "returns :open_tag if the input is an opening tag" do token = "" From 18c29dc83f227eb04df2b5719214ff081bd2ecfb Mon Sep 17 00:00:00 2001 From: Roy Chen Date: Sat, 22 Apr 2017 10:17:21 +0800 Subject: [PATCH 064/104] refactored #build_tree method --- lib/dom_reader.rb | 70 +++++++++++++++++++++++------------------------ 1 file changed, 35 insertions(+), 35 deletions(-) diff --git a/lib/dom_reader.rb b/lib/dom_reader.rb index 713da05..143c240 100644 --- a/lib/dom_reader.rb +++ b/lib/dom_reader.rb @@ -7,44 +7,44 @@ def build_tree(filename) # break file into tokens tokens = tokenize_file(filename) - # always good to check if our input is valid.. - unless tokens.empty? - # remove doctype declaration, if present - tokens.delete_at(0) if is_doctype_tag?(tokens[0]) + # exit if we have no tokens + return nil if tokens.empty? - # initialize the tree - tree = DOMTree.new + tokens = remove_doctype(tokens) + + # initialize the tree + tree = DOMTree.new + current_node = tree.document + + # set current depth of the tree + current_depth = 0 + + # create tree's root node if it does not exist yet + if tree.document.nil? + token = tokens.shift + tree.document = Node.new(token_type(token), token, current_depth, nil, []) current_node = tree.document + end - # set current depth of the tree - current_depth = 0 - - until tokens.empty? - token = tokens.shift - token_type = token_type(token) - - # create tree's root node if it does not exist yet - if tree.document.nil? - tree.document = Node.new(token_type, token, current_depth, nil, []) - current_node = tree.document - else - # root node already exists, create new node - case token_type - when :open_tag - node = Node.new(token_type, token, current_depth, current_node, []) - current_node.children << node - current_node = node - current_depth += 1 - when :text - node = Node.new(token_type, token, current_depth, current_node, []) - current_node.children << node - when :close_tag - current_node = current_node.parent unless current_node == tree.document - current_depth -= 1 - when :unknown - # ignore unknown tag - end - end + # process remaining tokens + until tokens.empty? + token = tokens.shift + token_type = token_type(token) + + case token_type + when :open_tag + node = Node.new(token_type, token, current_depth, current_node, []) + current_node.children << node + current_node = node + current_depth += 1 + when :text + node = Node.new(token_type, token, current_depth, current_node, []) + current_node.children << node + when :close_tag + current_node = current_node.parent unless current_node == tree.document + current_depth -= 1 + when :unknown + # ignore unknown tag for now end end From 617ece393f5f13afa0051c19f98992c95f7ebbd2 Mon Sep 17 00:00:00 2001 From: Roy Chen Date: Sat, 22 Apr 2017 10:19:38 +0800 Subject: [PATCH 065/104] refactored spec for #build_tree --- spec/dom_reader_spec.rb | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/spec/dom_reader_spec.rb b/spec/dom_reader_spec.rb index 12a59be..0130947 100644 --- a/spec/dom_reader_spec.rb +++ b/spec/dom_reader_spec.rb @@ -14,16 +14,16 @@ describe "#build_tree" do it "returns a tree of Nodes built from the input filename" do filename = './test.html' - # contents = ["\n", "\n", "\n", "Title\n", "\n", "\n", "

text

\n", "\n", "\n"] - - contents = ["\n", "\n", "Title\n", "\n", "\n", "

text

\n", "\n", ""] + contents = ["\n", "\n", "\n", "Title\n", "\n", "\n", "

text

\n", "\n", "\n"] allow(File).to receive(:readlines).with(filename).and_return(contents) tree = dom_reader.build_tree(filename) expect(tree).to be_a(DOMTree) expect(tree.document).to be_a(Node) - expect(tree.document.type).to eq(:open_tag) + expect(tree.document.type).to eq(:text) + expect(tree.document.children.first.type).to eq(:open_tag) + expect(tree.document.children.first.content).to eq("") end end From 4845f69a59f90e8d2ea1bf4204a8c466227dd2a7 Mon Sep 17 00:00:00 2001 From: Roy Chen Date: Sat, 22 Apr 2017 10:23:23 +0800 Subject: [PATCH 066/104] added example Ruby file to run program --- example.rb | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/example.rb b/example.rb index dd79ee8..3e94d41 100644 --- a/example.rb +++ b/example.rb @@ -2,16 +2,19 @@ require_relative './lib/dom_reader' dom_reader = DOMReader.new - contents = dom_reader.read_file("./test.html") - contents[0] = dom_reader.remove_doctype(contents[0]) - # looks like tokens are being split out correctly - # contiguous whitespace / newlines are being parsed as tokens too + # contents = dom_reader.read_file("./test.html") + # contents[0] = dom_reader.remove_doctype(contents[0]) + # + # # looks like tokens are being split out correctly + # # contiguous whitespace / newlines are being parsed as tokens too + # + # contents.each do |line| + # tokens = dom_reader.tokenize(line) + # tokens.each do |token| + # p "token: #{token}, type: #{dom_reader.token_type(token)}" + # end + # end - contents.each do |line| - tokens = dom_reader.tokenize(line) - tokens.each do |token| - p "token: #{token}, type: #{dom_reader.token_type(token)}" - end - end + p dom_reader.build_tree('./test.html') end From f788f04d367eb948734aabe0ccab8144db964b7a Mon Sep 17 00:00:00 2001 From: Roy Chen Date: Sat, 22 Apr 2017 10:44:12 +0800 Subject: [PATCH 067/104] initial specs definition for NodeRenderer class --- spec/node_renderer_spec.rb | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) create mode 100644 spec/node_renderer_spec.rb diff --git a/spec/node_renderer_spec.rb b/spec/node_renderer_spec.rb new file mode 100644 index 0000000..99b424b --- /dev/null +++ b/spec/node_renderer_spec.rb @@ -0,0 +1,19 @@ +# spec/node_renderer_spec.rb + +require 'node_renderer' + +describe "NodeRenderer" do + describe "#initialize" do + it "creates an instance of NodeRenderer" + end + + describe "#render" do + it "displays the total number of nodes in the sub-tree below the input node" + + it "displays a count of each node type in the sub-tree below the input node" + + it "displays all the input node's data attributes" + + it "displays all the statistics above for the root node of the tree, if the input node is nil" + end +end From a1357ef0ec67960ff1150d4a111bf33487b0d108 Mon Sep 17 00:00:00 2001 From: Roy Chen Date: Sat, 22 Apr 2017 10:44:47 +0800 Subject: [PATCH 068/104] initial NodeRenderer class definition --- lib/node_renderer.rb | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 lib/node_renderer.rb diff --git a/lib/node_renderer.rb b/lib/node_renderer.rb new file mode 100644 index 0000000..e69de29 From d02616231e5e01df61232dde87947f6fd109250a Mon Sep 17 00:00:00 2001 From: Roy Chen Date: Sat, 22 Apr 2017 11:08:25 +0800 Subject: [PATCH 069/104] added specs/behavior for #initialize and instance variable --- lib/node_renderer.rb | 7 +++++++ spec/node_renderer_spec.rb | 15 ++++++++++++++- 2 files changed, 21 insertions(+), 1 deletion(-) diff --git a/lib/node_renderer.rb b/lib/node_renderer.rb index e69de29..02e7c57 100644 --- a/lib/node_renderer.rb +++ b/lib/node_renderer.rb @@ -0,0 +1,7 @@ +class NodeRenderer + attr_reader :tree + + def initialize(tree = nil) + @tree = tree + end +end diff --git a/spec/node_renderer_spec.rb b/spec/node_renderer_spec.rb index 99b424b..9491282 100644 --- a/spec/node_renderer_spec.rb +++ b/spec/node_renderer_spec.rb @@ -3,8 +3,21 @@ require 'node_renderer' describe "NodeRenderer" do + let(:node_renderer) { NodeRenderer.new } + describe "#initialize" do - it "creates an instance of NodeRenderer" + it "creates an instance of NodeRenderer" do + expect(node_renderer).to be_a(NodeRenderer) + end + end + + context "instance variables" do + describe "#tree" do + it "returns the value of the instance variable @tree" do + test_node_renderer = NodeRenderer.new("tree") + expect(test_node_renderer.tree).to eq("tree") + end + end end describe "#render" do From 51fc7f461dd66645f8a703e6e2a38408da4b0cfd Mon Sep 17 00:00:00 2001 From: Roy Chen Date: Sat, 22 Apr 2017 11:25:40 +0800 Subject: [PATCH 070/104] added #render and #display_data_attributes methods --- lib/node_renderer.rb | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/lib/node_renderer.rb b/lib/node_renderer.rb index 02e7c57..2c1c66b 100644 --- a/lib/node_renderer.rb +++ b/lib/node_renderer.rb @@ -4,4 +4,18 @@ class NodeRenderer def initialize(tree = nil) @tree = tree end + + def render(node = nil) + display_data_attributes(node) + end + + def display_data_attributes(node = nil) + node = self.tree.document if node.nil? + + puts "=== Node data attributes ===" + puts "Node type: #{node.type}" + puts "Node content: #{node.content.inspect}" + puts "Node depth: #{node.depth}" + puts "Children: #{node.children.length}\n\n" + end end From 63c805f5090decc12ebcdfe12fa7f6a9953ce7f1 Mon Sep 17 00:00:00 2001 From: Roy Chen Date: Sat, 22 Apr 2017 11:26:35 +0800 Subject: [PATCH 071/104] modified spec for #render --- spec/node_renderer_spec.rb | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/spec/node_renderer_spec.rb b/spec/node_renderer_spec.rb index 9491282..4a66e46 100644 --- a/spec/node_renderer_spec.rb +++ b/spec/node_renderer_spec.rb @@ -25,7 +25,10 @@ it "displays a count of each node type in the sub-tree below the input node" - it "displays all the input node's data attributes" + it "displays all the input node's data attributes" do + expect(node_renderer).to receive(:display_data_attributes) + node_renderer.render + end it "displays all the statistics above for the root node of the tree, if the input node is nil" end From 969fa4e572247a93628aecb84a2c006a9d4779f9 Mon Sep 17 00:00:00 2001 From: Roy Chen Date: Sat, 22 Apr 2017 11:26:48 +0800 Subject: [PATCH 072/104] modified test code --- example.rb | 18 ++++-------------- 1 file changed, 4 insertions(+), 14 deletions(-) diff --git a/example.rb b/example.rb index 3e94d41..05da261 100644 --- a/example.rb +++ b/example.rb @@ -1,20 +1,10 @@ if $0 == __FILE__ require_relative './lib/dom_reader' + require_relative './lib/node_renderer' dom_reader = DOMReader.new - # contents = dom_reader.read_file("./test.html") - # contents[0] = dom_reader.remove_doctype(contents[0]) - # - # # looks like tokens are being split out correctly - # # contiguous whitespace / newlines are being parsed as tokens too - # - # contents.each do |line| - # tokens = dom_reader.tokenize(line) - # tokens.each do |token| - # p "token: #{token}, type: #{dom_reader.token_type(token)}" - # end - # end - - p dom_reader.build_tree('./test.html') + tree = dom_reader.build_tree('./test.html') + node_renderer = NodeRenderer.new(tree) + node_renderer.render end From ab6e0b622e566e1094ac2e4a25ffd1bd22717328 Mon Sep 17 00:00:00 2001 From: Roy Chen Date: Sat, 22 Apr 2017 11:36:19 +0800 Subject: [PATCH 073/104] refactored #build_tree method --- lib/dom_reader.rb | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/lib/dom_reader.rb b/lib/dom_reader.rb index 143c240..f6aa9b6 100644 --- a/lib/dom_reader.rb +++ b/lib/dom_reader.rb @@ -10,6 +10,7 @@ def build_tree(filename) # exit if we have no tokens return nil if tokens.empty? + # remove doctype declaration, if any tokens = remove_doctype(tokens) # initialize the tree @@ -31,20 +32,21 @@ def build_tree(filename) token = tokens.shift token_type = token_type(token) + node = Node.new(token_type, token, current_depth, current_node, []) + current_node.children << node + case token_type when :open_tag - node = Node.new(token_type, token, current_depth, current_node, []) - current_node.children << node current_node = node current_depth += 1 when :text - node = Node.new(token_type, token, current_depth, current_node, []) - current_node.children << node + next when :close_tag current_node = current_node.parent unless current_node == tree.document - current_depth -= 1 + current_depth -= 1 unless current_depth == 0 when :unknown # ignore unknown tag for now + next end end From 3f620235b3ff0476209c75ffd0d053ada903894e Mon Sep 17 00:00:00 2001 From: Roy Chen Date: Sun, 23 Apr 2017 11:06:25 +0800 Subject: [PATCH 074/104] added specs/behavior for #num_nodes_below method --- lib/dom_tree.rb | 16 ++++++++++++++++ spec/dom_tree_spec.rb | 13 +++++++++++++ 2 files changed, 29 insertions(+) diff --git a/lib/dom_tree.rb b/lib/dom_tree.rb index d9900b8..a880a2e 100644 --- a/lib/dom_tree.rb +++ b/lib/dom_tree.rb @@ -9,4 +9,20 @@ def initialize(document = nil) end # methods for traversing tree to be added here + + def num_nodes_below(node = nil) + return 0 if node.nil? || node.children.empty? + + count = 0 + queue = [] + node.children.each { |child| queue << child } + + until queue.empty? + current_node = queue.shift + count += 1 + current_node.children.each { |child| queue << child unless child.nil? } + end + + count + end end diff --git a/spec/dom_tree_spec.rb b/spec/dom_tree_spec.rb index 579ea0e..6750a03 100644 --- a/spec/dom_tree_spec.rb +++ b/spec/dom_tree_spec.rb @@ -24,6 +24,19 @@ end end + describe "#num_nodes_below" do + it "returns the number of nodes in the sub-tree below the specified node" do + node1 = Node.new(:open_tag, "", 0, nil, []) + node2 = Node.new(:text, "text", 0, node1, []) + node3 = Node.new(:close_tag, "", 0, node1, []) + + node1.children << node2 + node1.children << node3 + + expect(domtree.num_nodes_below(node1)).to eq(2) + end + end + context "Node struct" do let(:node) { Node.new(:open_tag, "

", 0, nil, []) } From 94ee191e1c90e59f6df193f1cf21262a2edbad85 Mon Sep 17 00:00:00 2001 From: Roy Chen Date: Sun, 23 Apr 2017 12:32:37 +0800 Subject: [PATCH 075/104] modified specs for #num_nodes_below method --- spec/dom_tree_spec.rb | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/spec/dom_tree_spec.rb b/spec/dom_tree_spec.rb index 6750a03..80bd7f0 100644 --- a/spec/dom_tree_spec.rb +++ b/spec/dom_tree_spec.rb @@ -31,9 +31,11 @@ node3 = Node.new(:close_tag, "", 0, node1, []) node1.children << node2 - node1.children << node3 + node2.children << node3 expect(domtree.num_nodes_below(node1)).to eq(2) + expect(domtree.num_nodes_below(node2)).to eq(1) + expect(domtree.num_nodes_below(node3)).to eq(0) end end From 15c360c9f8586bef5fed04f5cca32306072236dd Mon Sep 17 00:00:00 2001 From: Roy Chen Date: Sun, 23 Apr 2017 13:26:54 +0800 Subject: [PATCH 076/104] implemented specs/behaviors for #node_types_below below --- lib/dom_tree.rb | 22 ++++++++++++++++++++++ spec/dom_tree_spec.rb | 19 +++++++++++++++++++ 2 files changed, 41 insertions(+) diff --git a/lib/dom_tree.rb b/lib/dom_tree.rb index a880a2e..4e374cc 100644 --- a/lib/dom_tree.rb +++ b/lib/dom_tree.rb @@ -25,4 +25,26 @@ def num_nodes_below(node = nil) count end + + def node_types_below(node = nil) + return {} if node.nil? || node.children.empty? + + node_types = {} + + queue = [] + node.children.each { |child| queue << child } + + until queue.empty? + current_node = queue.shift + current_node.children.each { |child| queue << child unless child.nil? } + + if node_types.keys.include?(current_node.type) + node_types[current_node.type] += 1 + else + node_types[current_node.type] = 1 + end + end + + node_types + end end diff --git a/spec/dom_tree_spec.rb b/spec/dom_tree_spec.rb index 80bd7f0..633b007 100644 --- a/spec/dom_tree_spec.rb +++ b/spec/dom_tree_spec.rb @@ -39,6 +39,25 @@ end end + describe "#node_types_below" do + it "returns the types and quantities of nodes in the sub-tree below the specified node" do + node1 = Node.new(:open_tag, "", 0, nil, []) + node2 = Node.new(:text, "text", 0, node1, []) + node3 = Node.new(:close_tag, "", 0, node1, []) + + node1.children << node2 + node2.children << node3 + node_types = domtree.node_types_below(node1) + + expect(node_types).to be_a(Hash) + expect(node_types.keys.length).to eq(2) + expect(node_types.keys).to include(:text) + expect(node_types.keys).to include(:close_tag) + expect(node_types[:text]).to eq(1) + expect(node_types[:close_tag]).to eq(1) + end + end + context "Node struct" do let(:node) { Node.new(:open_tag, "

", 0, nil, []) } From aeb74776bdd14b492180643d3a9e60d307e2b1a0 Mon Sep 17 00:00:00 2001 From: Roy Chen Date: Sun, 23 Apr 2017 13:27:18 +0800 Subject: [PATCH 077/104] removed superfluous comment --- spec/dom_tree_spec.rb | 2 -- 1 file changed, 2 deletions(-) diff --git a/spec/dom_tree_spec.rb b/spec/dom_tree_spec.rb index 633b007..29df30b 100644 --- a/spec/dom_tree_spec.rb +++ b/spec/dom_tree_spec.rb @@ -95,6 +95,4 @@ end end end - - # tree method specs to be added here, e.g. for traversal end From f9aca6c51ecc14a6b19f57f16cd77c6db5a4d57c Mon Sep 17 00:00:00 2001 From: Roy Chen Date: Sun, 23 Apr 2017 13:27:45 +0800 Subject: [PATCH 078/104] renamed context --- spec/dom_tree_spec.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spec/dom_tree_spec.rb b/spec/dom_tree_spec.rb index 29df30b..aeb31e1 100644 --- a/spec/dom_tree_spec.rb +++ b/spec/dom_tree_spec.rb @@ -11,7 +11,7 @@ end end - context "object attributes" do + context "instance variables" do it "allows the instance variable @document to be read" do test_domtree = DOMTree.new("document") expect(test_domtree.document).to eq("document") From 61d8ee2489316bffb7bf950f3a144114ad57fc72 Mon Sep 17 00:00:00 2001 From: Roy Chen Date: Sun, 23 Apr 2017 13:39:24 +0800 Subject: [PATCH 079/104] modified spec for #render --- spec/node_renderer_spec.rb | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/spec/node_renderer_spec.rb b/spec/node_renderer_spec.rb index 4a66e46..9491282 100644 --- a/spec/node_renderer_spec.rb +++ b/spec/node_renderer_spec.rb @@ -25,10 +25,7 @@ it "displays a count of each node type in the sub-tree below the input node" - it "displays all the input node's data attributes" do - expect(node_renderer).to receive(:display_data_attributes) - node_renderer.render - end + it "displays all the input node's data attributes" it "displays all the statistics above for the root node of the tree, if the input node is nil" end From 06e615d54d1152dba55c3b2c66fcb7c05aec8a8f Mon Sep 17 00:00:00 2001 From: Roy Chen Date: Sun, 23 Apr 2017 13:40:02 +0800 Subject: [PATCH 080/104] added #display_num_nodes_below method --- lib/node_renderer.rb | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/lib/node_renderer.rb b/lib/node_renderer.rb index 2c1c66b..579a515 100644 --- a/lib/node_renderer.rb +++ b/lib/node_renderer.rb @@ -7,6 +7,7 @@ def initialize(tree = nil) def render(node = nil) display_data_attributes(node) + display_num_nodes_below(node) end def display_data_attributes(node = nil) @@ -18,4 +19,10 @@ def display_data_attributes(node = nil) puts "Node depth: #{node.depth}" puts "Children: #{node.children.length}\n\n" end + + def display_num_nodes_below(node = nil) + node = self.tree.document if node.nil? + + puts "Number of child nodes: #{self.tree.num_nodes_below(node)}\n\n" + end end From 8b44ded9bb87a45fea0f5fe08446b0e0c1277d16 Mon Sep 17 00:00:00 2001 From: Roy Chen Date: Sun, 23 Apr 2017 13:45:24 +0800 Subject: [PATCH 081/104] added #display_node_types_below method --- lib/node_renderer.rb | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/lib/node_renderer.rb b/lib/node_renderer.rb index 579a515..2efe856 100644 --- a/lib/node_renderer.rb +++ b/lib/node_renderer.rb @@ -8,6 +8,7 @@ def initialize(tree = nil) def render(node = nil) display_data_attributes(node) display_num_nodes_below(node) + display_node_types_below(node) end def display_data_attributes(node = nil) @@ -25,4 +26,12 @@ def display_num_nodes_below(node = nil) puts "Number of child nodes: #{self.tree.num_nodes_below(node)}\n\n" end + + def display_node_types_below(node = nil) + node = self.tree.document if node.nil? + node_types = self.tree.node_types_below(node) + + puts "=== Node types in subtree ===" + puts node_types + end end From e71af9de4d9b104eeb8adbe3c75b2576fac2925d Mon Sep 17 00:00:00 2001 From: Roy Chen Date: Sun, 23 Apr 2017 13:48:09 +0800 Subject: [PATCH 082/104] modified output of display methods --- lib/node_renderer.rb | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/lib/node_renderer.rb b/lib/node_renderer.rb index 2efe856..6492c0a 100644 --- a/lib/node_renderer.rb +++ b/lib/node_renderer.rb @@ -24,14 +24,15 @@ def display_data_attributes(node = nil) def display_num_nodes_below(node = nil) node = self.tree.document if node.nil? - puts "Number of child nodes: #{self.tree.num_nodes_below(node)}\n\n" + puts "=== Number of nodes in subtree(s) below ===" + puts "#{self.tree.num_nodes_below(node)}\n\n" end def display_node_types_below(node = nil) node = self.tree.document if node.nil? node_types = self.tree.node_types_below(node) - puts "=== Node types in subtree ===" + puts "=== Node types in subtree(s) below ===" puts node_types end end From 8ec000eea0946a920f7ef840c734eb861f51fd3b Mon Sep 17 00:00:00 2001 From: Roy Chen Date: Sun, 23 Apr 2017 14:20:38 +0800 Subject: [PATCH 083/104] reworked specs for #render --- spec/node_renderer_spec.rb | 28 +++++++++++++++++++++------- 1 file changed, 21 insertions(+), 7 deletions(-) diff --git a/spec/node_renderer_spec.rb b/spec/node_renderer_spec.rb index 9491282..ebf0d70 100644 --- a/spec/node_renderer_spec.rb +++ b/spec/node_renderer_spec.rb @@ -1,13 +1,12 @@ # spec/node_renderer_spec.rb +require 'dom_tree' require 'node_renderer' describe "NodeRenderer" do - let(:node_renderer) { NodeRenderer.new } - describe "#initialize" do it "creates an instance of NodeRenderer" do - expect(node_renderer).to be_a(NodeRenderer) + expect(NodeRenderer.new).to be_a(NodeRenderer) end end @@ -21,12 +20,27 @@ end describe "#render" do - it "displays the total number of nodes in the sub-tree below the input node" + node1 = Node.new(:open_tag, "", 0, nil, []) + node2 = Node.new(:text, "text", 0, node1, []) + node3 = Node.new(:close_tag, "", 0, node1, []) + tree = DOMTree.new(node1) + node_renderer = NodeRenderer.new(tree) - it "displays a count of each node type in the sub-tree below the input node" + before(:each) { allow(node_renderer).to receive(:puts).and_return(nil) } - it "displays all the input node's data attributes" + it "displays all the input node's data attributes" do + expect(node_renderer).to receive(:display_data_attributes) + node_renderer.render + end - it "displays all the statistics above for the root node of the tree, if the input node is nil" + it "displays the total number of nodes in the sub-tree below the input node" do + expect(node_renderer).to receive(:display_num_nodes_below) + node_renderer.render + end + + it "displays a count of each node type in the sub-tree below the input node" do + expect(node_renderer).to receive(:display_node_types_below) + node_renderer.render + end end end From a6d839cc288300c4848e62381ccd5bc41d358f24 Mon Sep 17 00:00:00 2001 From: Roy Chen Date: Sun, 23 Apr 2017 14:46:56 +0800 Subject: [PATCH 084/104] initial specs/class definitions for TreeSearcher class --- lib/tree_searcher.rb | 7 +++++++ spec/tree_searcher_spec.rb | 20 ++++++++++++++++++++ 2 files changed, 27 insertions(+) create mode 100644 lib/tree_searcher.rb create mode 100644 spec/tree_searcher_spec.rb diff --git a/lib/tree_searcher.rb b/lib/tree_searcher.rb new file mode 100644 index 0000000..ccea49f --- /dev/null +++ b/lib/tree_searcher.rb @@ -0,0 +1,7 @@ +class TreeSearcher + attr_reader :tree + + def initialize(tree = nil) + @tree = tree + end +end diff --git a/spec/tree_searcher_spec.rb b/spec/tree_searcher_spec.rb new file mode 100644 index 0000000..2e7a937 --- /dev/null +++ b/spec/tree_searcher_spec.rb @@ -0,0 +1,20 @@ +# spec/tree_searcher_spec.rb + +require 'tree_searcher' + +describe "TreeSearcher" do + describe "#initialize" do + it "creates an instance of TreeSearcher" do + expect(TreeSearcher.new).to be_a(TreeSearcher) + end + end + + context "instance variables" do + describe "#tree" do + it "returns the value of the instance variable @tree" do + test_tree_searcher = TreeSearcher.new("tree") + expect(test_tree_searcher.tree).to eq("tree") + end + end + end +end From c6ab3a3ef9a5038eabe6bee585135c462beb3a8f Mon Sep 17 00:00:00 2001 From: Roy Chen Date: Sun, 23 Apr 2017 14:57:04 +0800 Subject: [PATCH 085/104] defined more specs for TreeSearcher class --- spec/tree_searcher_spec.rb | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/spec/tree_searcher_spec.rb b/spec/tree_searcher_spec.rb index 2e7a937..ae0395b 100644 --- a/spec/tree_searcher_spec.rb +++ b/spec/tree_searcher_spec.rb @@ -17,4 +17,22 @@ end end end + + describe "#search_by" do + it "returns a collection of all nodes that match the name of the tag to search by" + + it "returns a collection of all nodes that match the text of the tag to search by" + + it "returns a collection of all nodes that match the id of the tag to search by" + + it "returns a collection of all nodes that match the id of the tag to search by" + end + + describe "search_descendents" do + it "returns the same results as #search_by, but only includes the descendents of the input node" + end + + describe "search_ancestors" do + it "returns the same results as #search_by, but only includes the ancestors of the input node" + end end From 17ec369d2d0fc034b8f27e267003af9676a2857a Mon Sep 17 00:00:00 2001 From: Roy Chen Date: Sun, 23 Apr 2017 15:58:23 +0800 Subject: [PATCH 086/104] added spec for #search_by --- spec/tree_searcher_spec.rb | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/spec/tree_searcher_spec.rb b/spec/tree_searcher_spec.rb index ae0395b..912d107 100644 --- a/spec/tree_searcher_spec.rb +++ b/spec/tree_searcher_spec.rb @@ -1,5 +1,6 @@ # spec/tree_searcher_spec.rb +require 'dom_tree' require 'tree_searcher' describe "TreeSearcher" do @@ -19,7 +20,19 @@ end describe "#search_by" do - it "returns a collection of all nodes that match the name of the tag to search by" + node1 = Node.new(:open_tag, "

", 0, nil, []) + node2 = Node.new(:text, "Some text right here!", 0, node1, []) + node3 = Node.new(:close_tag, "
", 0, node1, []) + tree = DOMTree.new(node1) + tree_searcher = TreeSearcher.new(tree) + + it "returns a collection of all nodes that match the name of the tag to search by" do + results = tree_searcher.search_by(:name, 'div') + + expect(results).to be_a(Array) + expect(results.length).to eq(1) + expect(results.first).to eq(node1) + end it "returns a collection of all nodes that match the text of the tag to search by" From 7f1cd956559a45994f79f5d4f2a8282402cb65d3 Mon Sep 17 00:00:00 2001 From: Roy Chen Date: Sun, 23 Apr 2017 15:58:50 +0800 Subject: [PATCH 087/104] added initial implementation for #search_by with private helper method --- lib/tree_searcher.rb | 39 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) diff --git a/lib/tree_searcher.rb b/lib/tree_searcher.rb index ccea49f..43d661b 100644 --- a/lib/tree_searcher.rb +++ b/lib/tree_searcher.rb @@ -4,4 +4,43 @@ class TreeSearcher def initialize(tree = nil) @tree = tree end + + def search_by(attribute, value) + case attribute + when :name + regex = /^<#{value}.*>$/ + node_type = value.start_with?("/") ? :close_tag : :open_tag + when :text + + when :id + + when :class + + else + puts "Invalid attribute to search for." + return nil + end + + find_nodes_by(node_type, regex, value) + end + + private + + def find_nodes_by(node_type, regex, value) + # perform BFS on tree to find matching nodes + collection = [] + queue = [] + + queue << self.tree.document + until queue.empty? + current_node = queue.shift + next unless current_node.type == node_type + + collection << current_node if current_node.content.match(regex) + + current_node.children.each { |child| queue << child unless child.nil? } + end + + collection + end end From f01054af15e621e60dc9e7f59c0576b20b7da7f5 Mon Sep 17 00:00:00 2001 From: Roy Chen Date: Sun, 23 Apr 2017 19:02:31 +0800 Subject: [PATCH 088/104] modified specs for #search_by --- spec/tree_searcher_spec.rb | 38 ++++++++++++++++++++++++++++---------- 1 file changed, 28 insertions(+), 10 deletions(-) diff --git a/spec/tree_searcher_spec.rb b/spec/tree_searcher_spec.rb index 912d107..9b397e1 100644 --- a/spec/tree_searcher_spec.rb +++ b/spec/tree_searcher_spec.rb @@ -20,25 +20,43 @@ end describe "#search_by" do + # setup nodes node1 = Node.new(:open_tag, "
", 0, nil, []) node2 = Node.new(:text, "Some text right here!", 0, node1, []) node3 = Node.new(:close_tag, "
", 0, node1, []) + + node4 = Node.new(:open_tag, "

", 1, node1, []) + node5 = Node.new(:text, "Some p text right here!", 1, node4, []) + node6 = Node.new(:close_tag, "

", 1, node4, []) + + node1.children << node2 + node1.children << node4 + node1.children << node3 + + node4.children << node5 + node4.children << node6 + + # setup tree and TreeSearcher instance tree = DOMTree.new(node1) tree_searcher = TreeSearcher.new(tree) - it "returns a collection of all nodes that match the name of the tag to search by" do - results = tree_searcher.search_by(:name, 'div') + context "search for nodes by their tag name" do + it "returns the correct nodes when searching for an opening tag" do + results = tree_searcher.search_by(:name, 'div') - expect(results).to be_a(Array) - expect(results.length).to eq(1) - expect(results.first).to eq(node1) - end - - it "returns a collection of all nodes that match the text of the tag to search by" + expect(results).to be_a(Array) + expect(results.length).to eq(1) + expect(results.first).to eq(node1) + end - it "returns a collection of all nodes that match the id of the tag to search by" + it "returns the correct nodes when searching for a closing tag" do + results = tree_searcher.search_by(:name, '/div') - it "returns a collection of all nodes that match the id of the tag to search by" + expect(results).to be_a(Array) + expect(results.length).to eq(1) + expect(results.first).to eq(node3) + end + end end describe "search_descendents" do From 3c11d7f37e3a3d8e21268eb42e2b26ff4def3559 Mon Sep 17 00:00:00 2001 From: Roy Chen Date: Sun, 23 Apr 2017 19:03:04 +0800 Subject: [PATCH 089/104] fixed bugs for #search_by, for searching by tags --- lib/tree_searcher.rb | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/lib/tree_searcher.rb b/lib/tree_searcher.rb index 43d661b..da4a5ad 100644 --- a/lib/tree_searcher.rb +++ b/lib/tree_searcher.rb @@ -8,7 +8,7 @@ def initialize(tree = nil) def search_by(attribute, value) case attribute when :name - regex = /^<#{value}.*>$/ + regex = /^<#{value}.*>$/i node_type = value.start_with?("/") ? :close_tag : :open_tag when :text @@ -34,9 +34,9 @@ def find_nodes_by(node_type, regex, value) queue << self.tree.document until queue.empty? current_node = queue.shift - next unless current_node.type == node_type + # next unless current_node.type == node_type - collection << current_node if current_node.content.match(regex) + collection << current_node if current_node.content.match(regex) && current_node.type == node_type current_node.children.each { |child| queue << child unless child.nil? } end From 75c87f2e45d3f95f8f836cd91f9cbe853e70fdeb Mon Sep 17 00:00:00 2001 From: Roy Chen Date: Sun, 23 Apr 2017 19:04:12 +0800 Subject: [PATCH 090/104] removed unnecessary code --- lib/tree_searcher.rb | 1 - 1 file changed, 1 deletion(-) diff --git a/lib/tree_searcher.rb b/lib/tree_searcher.rb index da4a5ad..858da1c 100644 --- a/lib/tree_searcher.rb +++ b/lib/tree_searcher.rb @@ -34,7 +34,6 @@ def find_nodes_by(node_type, regex, value) queue << self.tree.document until queue.empty? current_node = queue.shift - # next unless current_node.type == node_type collection << current_node if current_node.content.match(regex) && current_node.type == node_type From eb9c4d88d0b0548794deadffdf5b69f810889104 Mon Sep 17 00:00:00 2001 From: Roy Chen Date: Sun, 23 Apr 2017 19:13:46 +0800 Subject: [PATCH 091/104] added specs for searching nodes by text --- spec/tree_searcher_spec.rb | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/spec/tree_searcher_spec.rb b/spec/tree_searcher_spec.rb index 9b397e1..30b01d8 100644 --- a/spec/tree_searcher_spec.rb +++ b/spec/tree_searcher_spec.rb @@ -40,7 +40,7 @@ tree = DOMTree.new(node1) tree_searcher = TreeSearcher.new(tree) - context "search for nodes by their tag name" do + context "searching for nodes by their tag name" do it "returns the correct nodes when searching for an opening tag" do results = tree_searcher.search_by(:name, 'div') @@ -57,6 +57,23 @@ expect(results.first).to eq(node3) end end + + context "searching for nodes by text" do + it "returns the correct nodes when searching by text" do + results = tree_searcher.search_by(:text, "text right here") + + expect(results).to be_a(Array) + expect(results.length).to eq(2) + expect(results).to include(node2) + expect(results).to include(node5) + + results = tree_searcher.search_by(:text, "P TEXT") + + expect(results).to be_a(Array) + expect(results.length).to eq(1) + expect(results).to include(node5) + end + end end describe "search_descendents" do From 99410d3419252fbedf462aa186247c89ae86f69b Mon Sep 17 00:00:00 2001 From: Roy Chen Date: Sun, 23 Apr 2017 19:14:35 +0800 Subject: [PATCH 092/104] modified #search_by to add searching by text --- lib/tree_searcher.rb | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/lib/tree_searcher.rb b/lib/tree_searcher.rb index 858da1c..35e2d53 100644 --- a/lib/tree_searcher.rb +++ b/lib/tree_searcher.rb @@ -11,7 +11,8 @@ def search_by(attribute, value) regex = /^<#{value}.*>$/i node_type = value.start_with?("/") ? :close_tag : :open_tag when :text - + regex = /#{value}/i + node_type = :text when :id when :class From 992ce76ecd5e93ce8f4a595660d94f83cfa7435b Mon Sep 17 00:00:00 2001 From: Roy Chen Date: Sun, 23 Apr 2017 19:23:28 +0800 Subject: [PATCH 093/104] added specs/behavior for searching nodes by id --- lib/tree_searcher.rb | 3 ++- spec/tree_searcher_spec.rb | 10 ++++++++++ 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/lib/tree_searcher.rb b/lib/tree_searcher.rb index 35e2d53..a945087 100644 --- a/lib/tree_searcher.rb +++ b/lib/tree_searcher.rb @@ -14,7 +14,8 @@ def search_by(attribute, value) regex = /#{value}/i node_type = :text when :id - + regex = /id\s*=\s*['"]#{value}['"]/ + node_type = :open_tag when :class else diff --git a/spec/tree_searcher_spec.rb b/spec/tree_searcher_spec.rb index 30b01d8..3a7f1f1 100644 --- a/spec/tree_searcher_spec.rb +++ b/spec/tree_searcher_spec.rb @@ -74,6 +74,16 @@ expect(results).to include(node5) end end + + context "searching for nodes by id" do + it "returns the correct nodes when searching by id" do + results = tree_searcher.search_by(:id, "bar") + + expect(results).to be_a(Array) + expect(results.length).to eq(1) + expect(results.first).to eq(node1) + end + end end describe "search_descendents" do From 4b5ac57b5d3bc6231ce529da35437b43f1e7ca37 Mon Sep 17 00:00:00 2001 From: Roy Chen Date: Sun, 23 Apr 2017 19:24:23 +0800 Subject: [PATCH 094/104] made regex case-insensitive --- lib/tree_searcher.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/tree_searcher.rb b/lib/tree_searcher.rb index a945087..26b3dea 100644 --- a/lib/tree_searcher.rb +++ b/lib/tree_searcher.rb @@ -14,7 +14,7 @@ def search_by(attribute, value) regex = /#{value}/i node_type = :text when :id - regex = /id\s*=\s*['"]#{value}['"]/ + regex = /id\s*=\s*['"]#{value}['"]/i node_type = :open_tag when :class From 20c00427591db1132510ef6e970b2b93dac09333 Mon Sep 17 00:00:00 2001 From: Roy Chen Date: Mon, 24 Apr 2017 12:43:55 +0800 Subject: [PATCH 095/104] added specs/behavior for searching nodes by class --- lib/tree_searcher.rb | 3 ++- spec/tree_searcher_spec.rb | 22 +++++++++++++++++++++- 2 files changed, 23 insertions(+), 2 deletions(-) diff --git a/lib/tree_searcher.rb b/lib/tree_searcher.rb index 26b3dea..c9b186e 100644 --- a/lib/tree_searcher.rb +++ b/lib/tree_searcher.rb @@ -17,7 +17,8 @@ def search_by(attribute, value) regex = /id\s*=\s*['"]#{value}['"]/i node_type = :open_tag when :class - + regex = /class\s*=\s*['"].*?#{value}.*?['"]/i + node_type = :open_tag else puts "Invalid attribute to search for." return nil diff --git a/spec/tree_searcher_spec.rb b/spec/tree_searcher_spec.rb index 3a7f1f1..fa3f6b1 100644 --- a/spec/tree_searcher_spec.rb +++ b/spec/tree_searcher_spec.rb @@ -21,7 +21,7 @@ describe "#search_by" do # setup nodes - node1 = Node.new(:open_tag, "
", 0, nil, []) + node1 = Node.new(:open_tag, "
", 0, nil, []) node2 = Node.new(:text, "Some text right here!", 0, node1, []) node3 = Node.new(:close_tag, "
", 0, node1, []) @@ -84,6 +84,26 @@ expect(results.first).to eq(node1) end end + + context "searching for nodes by class" do + it "returns the correct nodes when searching by class" do + results = tree_searcher.search_by(:class, "foo") + + expect(results).to be_a(Array) + expect(results.length).to eq(2) + expect(results).to include(node1) + expect(results).to include(node4) + + results = tree_searcher.search_by(:class, "baz") + + expect(results).to be_a(Array) + expect(results.length).to eq(1) + expect(results.first).to eq(node1) + + results = tree_searcher.search_by(:class, "nope") + expect(results).to eq([]) + end + end end describe "search_descendents" do From 6ce71a32a75647a9d3bb063e7b41281a363f9a6d Mon Sep 17 00:00:00 2001 From: Roy Chen Date: Mon, 24 Apr 2017 12:44:22 +0800 Subject: [PATCH 096/104] removed spec --- spec/tree_searcher_spec.rb | 3 --- 1 file changed, 3 deletions(-) diff --git a/spec/tree_searcher_spec.rb b/spec/tree_searcher_spec.rb index fa3f6b1..07076ce 100644 --- a/spec/tree_searcher_spec.rb +++ b/spec/tree_searcher_spec.rb @@ -99,9 +99,6 @@ expect(results).to be_a(Array) expect(results.length).to eq(1) expect(results.first).to eq(node1) - - results = tree_searcher.search_by(:class, "nope") - expect(results).to eq([]) end end end From efe93da2615c9d3cd58380e25a5b962d2c680657 Mon Sep 17 00:00:00 2001 From: Roy Chen Date: Mon, 24 Apr 2017 12:51:29 +0800 Subject: [PATCH 097/104] added new context to specs --- spec/tree_searcher_spec.rb | 102 +++++++++++++++++++------------------ 1 file changed, 52 insertions(+), 50 deletions(-) diff --git a/spec/tree_searcher_spec.rb b/spec/tree_searcher_spec.rb index 07076ce..649a18b 100644 --- a/spec/tree_searcher_spec.rb +++ b/spec/tree_searcher_spec.rb @@ -19,7 +19,7 @@ end end - describe "#search_by" do + context "searching for nodes" do # setup nodes node1 = Node.new(:open_tag, "
", 0, nil, []) node2 = Node.new(:text, "Some text right here!", 0, node1, []) @@ -40,74 +40,76 @@ tree = DOMTree.new(node1) tree_searcher = TreeSearcher.new(tree) - context "searching for nodes by their tag name" do - it "returns the correct nodes when searching for an opening tag" do - results = tree_searcher.search_by(:name, 'div') + describe "#search_by" do + context "searching for nodes by their tag name" do + it "returns the correct nodes when searching for an opening tag" do + results = tree_searcher.search_by(:name, 'div') - expect(results).to be_a(Array) - expect(results.length).to eq(1) - expect(results.first).to eq(node1) - end + expect(results).to be_a(Array) + expect(results.length).to eq(1) + expect(results.first).to eq(node1) + end - it "returns the correct nodes when searching for a closing tag" do - results = tree_searcher.search_by(:name, '/div') + it "returns the correct nodes when searching for a closing tag" do + results = tree_searcher.search_by(:name, '/div') - expect(results).to be_a(Array) - expect(results.length).to eq(1) - expect(results.first).to eq(node3) + expect(results).to be_a(Array) + expect(results.length).to eq(1) + expect(results.first).to eq(node3) + end end - end - context "searching for nodes by text" do - it "returns the correct nodes when searching by text" do - results = tree_searcher.search_by(:text, "text right here") + context "searching for nodes by text" do + it "returns the correct nodes when searching by text" do + results = tree_searcher.search_by(:text, "text right here") - expect(results).to be_a(Array) - expect(results.length).to eq(2) - expect(results).to include(node2) - expect(results).to include(node5) + expect(results).to be_a(Array) + expect(results.length).to eq(2) + expect(results).to include(node2) + expect(results).to include(node5) - results = tree_searcher.search_by(:text, "P TEXT") + results = tree_searcher.search_by(:text, "P TEXT") - expect(results).to be_a(Array) - expect(results.length).to eq(1) - expect(results).to include(node5) + expect(results).to be_a(Array) + expect(results.length).to eq(1) + expect(results).to include(node5) + end end - end - context "searching for nodes by id" do - it "returns the correct nodes when searching by id" do - results = tree_searcher.search_by(:id, "bar") + context "searching for nodes by id" do + it "returns the correct nodes when searching by id" do + results = tree_searcher.search_by(:id, "bar") - expect(results).to be_a(Array) - expect(results.length).to eq(1) - expect(results.first).to eq(node1) + expect(results).to be_a(Array) + expect(results.length).to eq(1) + expect(results.first).to eq(node1) + end end - end - context "searching for nodes by class" do - it "returns the correct nodes when searching by class" do - results = tree_searcher.search_by(:class, "foo") + context "searching for nodes by class" do + it "returns the correct nodes when searching by class" do + results = tree_searcher.search_by(:class, "foo") - expect(results).to be_a(Array) - expect(results.length).to eq(2) - expect(results).to include(node1) - expect(results).to include(node4) + expect(results).to be_a(Array) + expect(results.length).to eq(2) + expect(results).to include(node1) + expect(results).to include(node4) - results = tree_searcher.search_by(:class, "baz") + results = tree_searcher.search_by(:class, "baz") - expect(results).to be_a(Array) - expect(results.length).to eq(1) - expect(results.first).to eq(node1) + expect(results).to be_a(Array) + expect(results.length).to eq(1) + expect(results.first).to eq(node1) + end end end - end - describe "search_descendents" do - it "returns the same results as #search_by, but only includes the descendents of the input node" - end + describe "search_descendents" do + it "returns the same results as #search_by, but only includes the descendents of the input node" + end - describe "search_ancestors" do - it "returns the same results as #search_by, but only includes the ancestors of the input node" + describe "search_ancestors" do + it "returns the same results as #search_by, but only includes the ancestors of the input node" + end end end From 3909a4349072f0a17446fddc3ea21a86311cfa02 Mon Sep 17 00:00:00 2001 From: Roy Chen Date: Mon, 24 Apr 2017 12:59:26 +0800 Subject: [PATCH 098/104] added specs/behavior for #search_descendents method --- lib/tree_searcher.rb | 9 +++++++++ spec/tree_searcher_spec.rb | 17 +++++++++++++++-- 2 files changed, 24 insertions(+), 2 deletions(-) diff --git a/lib/tree_searcher.rb b/lib/tree_searcher.rb index c9b186e..ef1a4db 100644 --- a/lib/tree_searcher.rb +++ b/lib/tree_searcher.rb @@ -27,6 +27,15 @@ def search_by(attribute, value) find_nodes_by(node_type, regex, value) end + def search_descendents(start_node, attribute, value) + # we can make use of the node depth to determine whether + # the nodes returned by our search are children of start_node + + nodes = search_by(attribute, value) + + nodes.select { |node| node.depth > start_node.depth } + end + private def find_nodes_by(node_type, regex, value) diff --git a/spec/tree_searcher_spec.rb b/spec/tree_searcher_spec.rb index 649a18b..499c49e 100644 --- a/spec/tree_searcher_spec.rb +++ b/spec/tree_searcher_spec.rb @@ -104,8 +104,21 @@ end end - describe "search_descendents" do - it "returns the same results as #search_by, but only includes the descendents of the input node" + describe "#search_descendents" do + it "returns the same results as #search_by, but only includes the descendents of the input node" do + root = tree_searcher.tree.document + results = tree_searcher.search_descendents(root, :class, "foo") + + expect(results).to be_a(Array) + expect(results.length).to eq(1) + expect(results.first).to eq(node4) + + results = tree_searcher.search_descendents(root, :text, "right here") + + expect(results).to be_a(Array) + expect(results.length).to eq(1) + expect(results.first).to eq(node5) + end end describe "search_ancestors" do From 9deed65f25aafed6e8fa538632bbb99cea15dbe2 Mon Sep 17 00:00:00 2001 From: Roy Chen Date: Mon, 24 Apr 2017 12:59:56 +0800 Subject: [PATCH 099/104] edited comment --- spec/tree_searcher_spec.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spec/tree_searcher_spec.rb b/spec/tree_searcher_spec.rb index 499c49e..45ffa4a 100644 --- a/spec/tree_searcher_spec.rb +++ b/spec/tree_searcher_spec.rb @@ -20,7 +20,7 @@ end context "searching for nodes" do - # setup nodes + # setup nodes for integration test node1 = Node.new(:open_tag, "
", 0, nil, []) node2 = Node.new(:text, "Some text right here!", 0, node1, []) node3 = Node.new(:close_tag, "
", 0, node1, []) From d28f78094a45391f9a25c610a3bfab2c462613b4 Mon Sep 17 00:00:00 2001 From: Roy Chen Date: Mon, 24 Apr 2017 13:00:10 +0800 Subject: [PATCH 100/104] edited comment --- lib/tree_searcher.rb | 3 --- 1 file changed, 3 deletions(-) diff --git a/lib/tree_searcher.rb b/lib/tree_searcher.rb index ef1a4db..6a048c3 100644 --- a/lib/tree_searcher.rb +++ b/lib/tree_searcher.rb @@ -28,9 +28,6 @@ def search_by(attribute, value) end def search_descendents(start_node, attribute, value) - # we can make use of the node depth to determine whether - # the nodes returned by our search are children of start_node - nodes = search_by(attribute, value) nodes.select { |node| node.depth > start_node.depth } From 61ed7736faf0e6500723ac8665c81f3fe37b0fe5 Mon Sep 17 00:00:00 2001 From: Roy Chen Date: Mon, 24 Apr 2017 13:05:39 +0800 Subject: [PATCH 101/104] added specs/behavior for #search_ancestors method --- lib/tree_searcher.rb | 6 ++++++ spec/tree_searcher_spec.rb | 15 ++++++++++++++- 2 files changed, 20 insertions(+), 1 deletion(-) diff --git a/lib/tree_searcher.rb b/lib/tree_searcher.rb index 6a048c3..191af98 100644 --- a/lib/tree_searcher.rb +++ b/lib/tree_searcher.rb @@ -33,6 +33,12 @@ def search_descendents(start_node, attribute, value) nodes.select { |node| node.depth > start_node.depth } end + def search_ancestors(start_node, attribute, value) + nodes = search_by(attribute, value) + + nodes.select { |node| node.depth < start_node.depth } + end + private def find_nodes_by(node_type, regex, value) diff --git a/spec/tree_searcher_spec.rb b/spec/tree_searcher_spec.rb index 45ffa4a..b63af64 100644 --- a/spec/tree_searcher_spec.rb +++ b/spec/tree_searcher_spec.rb @@ -122,7 +122,20 @@ end describe "search_ancestors" do - it "returns the same results as #search_by, but only includes the ancestors of the input node" + it "returns the same results as #search_by, but only includes the ancestors of the input node" do + start_node = node6 + results = tree_searcher.search_ancestors(start_node, :class, "foo") + + expect(results).to be_a(Array) + expect(results.length).to eq(1) + expect(results.first).to eq(node1) + + results = tree_searcher.search_ancestors(start_node, :text, "right here") + + expect(results).to be_a(Array) + expect(results.length).to eq(1) + expect(results.first).to eq(node2) + end end end end From ca29bd340c637e764b59dafc505762d34c69739c Mon Sep 17 00:00:00 2001 From: Roy Chen Date: Mon, 24 Apr 2017 13:53:15 +0800 Subject: [PATCH 102/104] added DOMRebuilder class --- lib/dom_rebuilder.rb | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) create mode 100644 lib/dom_rebuilder.rb diff --git a/lib/dom_rebuilder.rb b/lib/dom_rebuilder.rb new file mode 100644 index 0000000..076ae5b --- /dev/null +++ b/lib/dom_rebuilder.rb @@ -0,0 +1,30 @@ +class DOMRebuilder + attr_reader :tree + + def initialize(tree = nil) + @tree = tree + end + + # print out the DOM tree using DFS + def print_tree(start_node = nil) + output = "" + + start_node = self.tree.document if start_node.nil? + + stack = [] + stack.push(start_node) + + until stack.empty? + current_node = stack.pop + + output << current_node.content + + # it's important to reverse the child nodes first before + # pushing onto the stack, so that they'll be printed out + # in the right order + current_node.children.reverse.each { |child| stack.push(child) } + end + + puts output + end +end From 2238d65223108ededfb99b1855b895fec43b1a64 Mon Sep 17 00:00:00 2001 From: Roy Chen Date: Mon, 24 Apr 2017 13:53:39 +0800 Subject: [PATCH 103/104] modified test code --- example.rb | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/example.rb b/example.rb index 05da261..cf7171a 100644 --- a/example.rb +++ b/example.rb @@ -1,10 +1,15 @@ if $0 == __FILE__ require_relative './lib/dom_reader' require_relative './lib/node_renderer' + require_relative './lib/dom_rebuilder' dom_reader = DOMReader.new tree = dom_reader.build_tree('./test.html') + node_renderer = NodeRenderer.new(tree) node_renderer.render + + dom_rebuilder = DOMRebuilder.new(tree) + dom_rebuilder.print_tree end From f80a4c74372ed40efdfb4d0d6240569872990608 Mon Sep 17 00:00:00 2001 From: Roy Chen Date: Mon, 24 Apr 2017 14:00:12 +0800 Subject: [PATCH 104/104] modified README --- README.md | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/README.md b/README.md index 03431dc..17cb21c 100644 --- a/README.md +++ b/README.md @@ -4,3 +4,19 @@ Like leaves on the wind [A data structures, algorithms, file I/O, ruby and regular expression (regex) project from the Viking Code School](http://www.vikingcodeschool.com) Worked on by [Roy Chen](https://github.com/roychen25) + +## Getting Started + +To run this program, fork and clone this repository. + +In the cloned directory, run this command: + +``` +ruby example.rb +``` + +The output includes: + +1. Printing information about the root node of the DOM tree created + +2. Rebuilding the DOM tree into its original format