diff --git a/lib/article_json/configuration.rb b/lib/article_json/configuration.rb index 77b96860..b502c02d 100644 --- a/lib/article_json/configuration.rb +++ b/lib/article_json/configuration.rb @@ -39,8 +39,9 @@ def register_element_exporters(exporter, type_class_mapping) if !type_class_mapping.is_a?(Hash) || type_class_mapping.keys.any? { |key| !key.is_a? Symbol } || type_class_mapping.values.any? { |value| !value.is_a? Class } - raise ArgumentError, '`type_class_mapping` has to be a Hash with '\ - 'symbolized keys and classes as values but is '\ + + raise ArgumentError, '`type_class_mapping` has to be a Hash with ' \ + 'symbolized keys and classes as values but is ' \ "`#{type_class_mapping.inspect}`" end diff --git a/lib/article_json/elements/heading.rb b/lib/article_json/elements/heading.rb index f9e29bff..bf0c8dd0 100644 --- a/lib/article_json/elements/heading.rb +++ b/lib/article_json/elements/heading.rb @@ -34,4 +34,3 @@ def parse_hash(hash) end end end - diff --git a/lib/article_json/elements/image.rb b/lib/article_json/elements/image.rb index f77a81f3..d99bb949 100644 --- a/lib/article_json/elements/image.rb +++ b/lib/article_json/elements/image.rb @@ -46,4 +46,3 @@ def parse_hash(hash) end end end - diff --git a/lib/article_json/elements/list.rb b/lib/article_json/elements/list.rb index bcf6d070..edfe0a4f 100644 --- a/lib/article_json/elements/list.rb +++ b/lib/article_json/elements/list.rb @@ -34,4 +34,3 @@ def parse_hash(hash) end end end - diff --git a/lib/article_json/elements/paragraph.rb b/lib/article_json/elements/paragraph.rb index 628b0efa..b87a1399 100644 --- a/lib/article_json/elements/paragraph.rb +++ b/lib/article_json/elements/paragraph.rb @@ -36,6 +36,7 @@ def blank? # @return [Integer] def length return 0 if empty? + @content.reduce(0) do |sum, element| sum + (element.respond_to?(:length) ? 
element.length : 0) end @@ -52,4 +53,3 @@ def parse_hash(hash) end end end - diff --git a/lib/article_json/elements/quote.rb b/lib/article_json/elements/quote.rb index 7eb0028c..786733cd 100644 --- a/lib/article_json/elements/quote.rb +++ b/lib/article_json/elements/quote.rb @@ -38,4 +38,3 @@ def parse_hash(hash) end end end - diff --git a/lib/article_json/elements/text.rb b/lib/article_json/elements/text.rb index 0396f2f0..92676c12 100644 --- a/lib/article_json/elements/text.rb +++ b/lib/article_json/elements/text.rb @@ -44,6 +44,7 @@ def blank? # @return [Integer] def length return 0 if blank? + content.length end alias size length @@ -63,4 +64,3 @@ def parse_hash(hash) end end end - diff --git a/lib/article_json/elements/text_box.rb b/lib/article_json/elements/text_box.rb index 5157c27d..428e5db8 100644 --- a/lib/article_json/elements/text_box.rb +++ b/lib/article_json/elements/text_box.rb @@ -38,4 +38,3 @@ def parse_hash(hash) end end end - diff --git a/lib/article_json/export/amp/custom_element_library_resolver.rb b/lib/article_json/export/amp/custom_element_library_resolver.rb index efceafc5..ef1b2cb6 100644 --- a/lib/article_json/export/amp/custom_element_library_resolver.rb +++ b/lib/article_json/export/amp/custom_element_library_resolver.rb @@ -54,4 +54,3 @@ def custom_element_script_mapping(custom_element_tag) end end end - diff --git a/lib/article_json/export/amp/elements/embed.rb b/lib/article_json/export/amp/elements/embed.rb index ef321622..ea23f7d0 100644 --- a/lib/article_json/export/amp/elements/embed.rb +++ b/lib/article_json/export/amp/elements/embed.rb @@ -42,18 +42,22 @@ def embedded_object # @return [Nokogiri::XML::Element] def youtube_node - create_element('amp-youtube', - 'data-videoid' => @element.embed_id, - width: default_width, - height: default_height) + create_element( + 'amp-youtube', + 'data-videoid' => @element.embed_id, + width: default_width, + height: default_height + ) end # @return [Nokogiri::XML::Element] def vimeo_node - 
create_element('amp-vimeo', - 'data-videoid' => @element.embed_id, - width: default_width, - height: default_height) + create_element( + 'amp-vimeo', + 'data-videoid' => @element.embed_id, + width: default_width, + height: default_height + ) end # @return [Nokogiri::XML::Element] @@ -61,42 +65,50 @@ def tweet_node # The embed_id of a tweet is stored as "<handle>/<tweet_id>" but # the `amp-twitter` tag only takes the `tweet_id` part tweet_id = @element.embed_id.split('/').last - create_element('amp-twitter', - 'data-tweetid': tweet_id, - width: default_width, - height: default_height) + create_element( + 'amp-twitter', + 'data-tweetid': tweet_id, + width: default_width, + height: default_height + ) end # @return [Nokogiri::XML::Element] def facebook_node url = "#{@element.oembed_data[:author_url]}/videos/#{@element.embed_id}" - create_element('amp-facebook', - 'data-embedded-as' => 'video', - 'data-href' => url, - width: default_width, - height: default_height) + create_element( + 'amp-facebook', + 'data-embedded-as' => 'video', + 'data-href' => url, + width: default_width, + height: default_height + ) end def soundcloud_node src = Nokogiri::HTML(@element.oembed_data[:html]) - .xpath('//iframe/@src').first.value - track_id = src.match(/tracks%2F(\d+)/)[1] - create_element('amp-soundcloud', - layout: 'fixed-height', - 'data-trackid': track_id, - 'data-visual': true, - width: 'auto', - height: default_height) + .xpath('//iframe/@src').first.value + track_id = src.match(%r{tracks%2F(\d+)})[1] + create_element( + 'amp-soundcloud', + layout: 'fixed-height', + 'data-trackid': track_id, + 'data-visual': true, + width: 'auto', + height: default_height + ) end # @return [Nokogiri::XML::Element] def iframe_node node = Nokogiri::HTML(@element.oembed_data[:html]).xpath('//iframe') - create_element('amp-iframe', - src: node.attribute('src').value, - width: node.attribute('width').value, - height: node.attribute('height').value, - frameborder: '0',) + create_element( + 'amp-iframe', + src:
node.attribute('src').value, + width: node.attribute('width').value, + height: node.attribute('height').value, + frameborder: '0' + ) end # @return [String] diff --git a/lib/article_json/export/amp/elements/image.rb b/lib/article_json/export/amp/elements/image.rb index 172414f4..99147c7e 100644 --- a/lib/article_json/export/amp/elements/image.rb +++ b/lib/article_json/export/amp/elements/image.rb @@ -9,11 +9,13 @@ class Image < Base # @return [Nokogiri::HTML::NodeSet] def image_node - create_element('amp-img', - src: @element.source_url, - width: default_width, - height: default_height, - layout: :responsive) + create_element( + 'amp-img', + src: @element.source_url, + width: default_width, + height: default_height, + layout: :responsive + ) end def default_width diff --git a/lib/article_json/export/amp/exporter.rb b/lib/article_json/export/amp/exporter.rb index 64c71f2d..59caa47e 100644 --- a/lib/article_json/export/amp/exporter.rb +++ b/lib/article_json/export/amp/exporter.rb @@ -8,10 +8,11 @@ class Exporter # @return [Array[Symbol]] def custom_element_tags return @custom_element_tags if defined? @custom_element_tags + @custom_element_tags = element_exporters - .flat_map { |element| element.custom_element_tags } - .uniq + .flat_map { |element| element.custom_element_tags } + .uniq end # Return an array with all the javascript libraries needed for some @@ -19,6 +20,7 @@ def custom_element_tags # @return [Array] def amp_libraries return @amp_libraries if defined? 
@amp_libraries + @amp_libraries = CustomElementLibraryResolver.new(custom_element_tags).script_tags end diff --git a/lib/article_json/export/apple_news/elements/embed.rb b/lib/article_json/export/apple_news/elements/embed.rb index 74ed1de4..5bba7f5f 100644 --- a/lib/article_json/export/apple_news/elements/embed.rb +++ b/lib/article_json/export/apple_news/elements/embed.rb @@ -51,7 +51,7 @@ def caption_text def text @element.caption.map do |child_element| text_exporter.new(child_element) - .export + .export end.join end diff --git a/lib/article_json/export/apple_news/elements/image.rb b/lib/article_json/export/apple_news/elements/image.rb index 3e00f768..f0c98c50 100644 --- a/lib/article_json/export/apple_news/elements/image.rb +++ b/lib/article_json/export/apple_news/elements/image.rb @@ -10,6 +10,7 @@ def export end private + # Image # @return [Hash] def image @@ -48,7 +49,7 @@ def caption_text def text @element.caption.map do |child_element| text_exporter.new(child_element) - .export + .export end.join end end diff --git a/lib/article_json/export/apple_news/elements/paragraph.rb b/lib/article_json/export/apple_news/elements/paragraph.rb index 0493a066..78e1089c 100644 --- a/lib/article_json/export/apple_news/elements/paragraph.rb +++ b/lib/article_json/export/apple_news/elements/paragraph.rb @@ -26,7 +26,7 @@ def text_exporter def text @element.content.map do |child_element| text_exporter.new(child_element) - .export + .export end.join end end diff --git a/lib/article_json/export/apple_news/elements/text.rb b/lib/article_json/export/apple_news/elements/text.rb index 7e888589..345687d3 100644 --- a/lib/article_json/export/apple_news/elements/text.rb +++ b/lib/article_json/export/apple_news/elements/text.rb @@ -6,8 +6,21 @@ class Text < Base include ArticleJSON::Export::Common::HTML::Elements::Base include ArticleJSON::Export::Common::HTML::Elements::Text - UNSUPPORTED_HTML_TAGS = %w[title meta script noscript style link applet object iframe - noframes form select 
option optgroup + UNSUPPORTED_HTML_TAGS = %w[ + title + meta + script + noscript + style + link + applet + object + iframe + noframes + form + select + option + optgroup ].freeze # A Nokogiri object is returned with`super`, which is is then diff --git a/lib/article_json/export/apple_news/exporter.rb b/lib/article_json/export/apple_news/exporter.rb index eec70865..61b2d853 100644 --- a/lib/article_json/export/apple_news/exporter.rb +++ b/lib/article_json/export/apple_news/exporter.rb @@ -21,7 +21,7 @@ def to_json private # Generate an array with the plain text representation of all elements - # + # # @return [Array] def components @components ||= diff --git a/lib/article_json/export/common/html/elements/embed.rb b/lib/article_json/export/common/html/elements/embed.rb index bd66e335..0e84fc86 100644 --- a/lib/article_json/export/common/html/elements/embed.rb +++ b/lib/article_json/export/common/html/elements/embed.rb @@ -20,7 +20,7 @@ def export private def embed_node - type = @element.embed_type.to_s.tr('_','-') + type = @element.embed_type.to_s.tr('_', '-') create_element(:div, class: "embed #{type}") do |div| div.add_child(embedded_object) end @@ -28,6 +28,7 @@ def embed_node def embedded_object return unavailable_node unless @element.oembed_data + Nokogiri::HTML.fragment(@element.oembed_data[:html]) end diff --git a/lib/article_json/export/common/html/elements/image.rb b/lib/article_json/export/common/html/elements/image.rb index bf537ea3..c5e96126 100644 --- a/lib/article_json/export/common/html/elements/image.rb +++ b/lib/article_json/export/common/html/elements/image.rb @@ -19,7 +19,7 @@ def export # @return [Nokogiri::XML::NodeSet] def figure_node create_element(:figure, node_opts) do |figure| - node = @element&.href ? href_node : image_node + node = @element&.href ? href_node : image_node figure.add_child(node) if @element.caption&.any? 
figure.add_child(caption_node(:figcaption)) @@ -42,6 +42,7 @@ def href_node # @return [Hash] def node_opts return if floating_class.nil? + { class: floating_class } end end diff --git a/lib/article_json/export/common/html/elements/text.rb b/lib/article_json/export/common/html/elements/text.rb index 8b3a7257..43401802 100644 --- a/lib/article_json/export/common/html/elements/text.rb +++ b/lib/article_json/export/common/html/elements/text.rb @@ -11,6 +11,7 @@ def export return bold_and_italic_node if @element.bold && @element.italic return bold_node if @element.bold return italic_node if @element.italic + content_node end @@ -38,6 +39,7 @@ def bold_and_italic_node # @return [Nokogiri::XML::NodeSet] def content_node return create_text_nodes(@element.content) if @element.href.nil? + create_element(:a, href: @element.href) do |a| a.add_child(create_text_nodes(@element.content)) end diff --git a/lib/article_json/import/google_doc/html/embedded_parser.rb b/lib/article_json/import/google_doc/html/embedded_parser.rb index 260f669d..f2489eb5 100644 --- a/lib/article_json/import/google_doc/html/embedded_parser.rb +++ b/lib/article_json/import/google_doc/html/embedded_parser.rb @@ -104,6 +104,7 @@ def parsers def find_parser(text) text = text.strip.downcase return nil if text.empty? 
+ parsers.find { |klass| klass.matches?(text) } end end diff --git a/lib/article_json/import/google_doc/html/heading_parser.rb b/lib/article_json/import/google_doc/html/heading_parser.rb index f0df3174..5fcc8588 100644 --- a/lib/article_json/import/google_doc/html/heading_parser.rb +++ b/lib/article_json/import/google_doc/html/heading_parser.rb @@ -19,11 +19,11 @@ def content # @return [Integer] def level case @node.name - when 'h1' then 1 - when 'h2' then 2 - when 'h3' then 3 - when 'h4' then 4 - when 'h5' then 5 + when 'h1' then 1 + when 'h2' then 2 + when 'h3' then 3 + when 'h4' then 4 + when 'h5' then 5 end end diff --git a/lib/article_json/import/google_doc/html/image_parser.rb b/lib/article_json/import/google_doc/html/image_parser.rb index d49c0270..7bcdee3c 100644 --- a/lib/article_json/import/google_doc/html/image_parser.rb +++ b/lib/article_json/import/google_doc/html/image_parser.rb @@ -54,8 +54,10 @@ def float # @return [String] def href return if @caption_node.nil? + match = @caption_node.content.strip.match(href_regexp) return if match.nil? + remove_image_link_tag match[:url] end diff --git a/lib/article_json/import/google_doc/html/list_parser.rb b/lib/article_json/import/google_doc/html/list_parser.rb index 74ec8d76..29f45565 100644 --- a/lib/article_json/import/google_doc/html/list_parser.rb +++ b/lib/article_json/import/google_doc/html/list_parser.rb @@ -14,8 +14,8 @@ def initialize(node:, css_analyzer:) # @return [Symbol] def list_type case @node.name - when 'ol' then :ordered - when 'ul' then :unordered + when 'ol' then :ordered + when 'ul' then :unordered end end diff --git a/lib/article_json/import/google_doc/html/node_analyzer.rb b/lib/article_json/import/google_doc/html/node_analyzer.rb index 08adec16..61f93b39 100644 --- a/lib/article_json/import/google_doc/html/node_analyzer.rb +++ b/lib/article_json/import/google_doc/html/node_analyzer.rb @@ -31,6 +31,7 @@ def begins_with?(text) # @return [Boolean] def empty? return @is_empty if defined? 
@is_empty + @is_empty = node.inner_text.strip.empty? && !image? && !hr? && !br? end @@ -38,6 +39,7 @@ def empty? # @return [Boolean] def heading? return @is_heading if defined? @is_heading + @is_heading = !quote? && !text_box? && %w(h1 h2 h3 h4 h5).include?(node.name) end @@ -52,6 +54,7 @@ def hr? # @return [Boolean] def paragraph? return @is_paragraph if defined? @is_paragraph + @is_paragraph = node.name == 'p' && !empty? && @@ -65,7 +68,8 @@ def paragraph? # @return [Boolean] def list? return @is_list if defined? @is_list - @is_list = %w(ul ol).include?(node.name) + + @is_list = %w[ul ol].include?(node.name) end # Check if the node starts a text box @@ -73,6 +77,7 @@ def list? # @return [Boolean] def text_box? return @is_text_box if defined? @is_text_box + @is_text_box = begins_with?('textbox:') || begins_with?('highlight:') end @@ -81,6 +86,7 @@ def text_box? # @return [Boolean] def quote? return @is_quote if defined? @is_quote + @is_quote = has_text?('quote:') end @@ -88,6 +94,7 @@ def quote? # @return [Boolean] def image? return @is_image if defined? @is_image + @is_image = image_url? || node.xpath('.//img').length > 0 end @@ -105,6 +112,7 @@ def image_url? # @return [Boolean] def embed? return @is_embed if defined? @is_embed + @is_embed = EmbeddedParser.supported?(node) end @@ -113,6 +121,7 @@ def embed? # @return [Boolean] def br? return @is_br if defined? @is_br + @is_br = node.name == 'br' || only_includes_brs? end @@ -129,6 +138,7 @@ def type return :quote if quote? return :image if image? return :embed if embed? + :unknown end @@ -138,9 +148,11 @@ def type # @return [Boolean] def only_includes_brs? return false unless node.inner_text.strip.empty? + tags = node.children.map(&:name) # Check if it only contains
<br> and text nodes - return false unless tags.all? { |tag| %w(br text).include? tag } + return false unless tags.all? { |tag| %w[br text].include? tag } + # Check if at least one is a `<br>
` node tags.include?('br') end diff --git a/lib/article_json/import/google_doc/html/parser.rb b/lib/article_json/import/google_doc/html/parser.rb index 2fd47997..81e19bb9 100644 --- a/lib/article_json/import/google_doc/html/parser.rb +++ b/lib/article_json/import/google_doc/html/parser.rb @@ -118,6 +118,7 @@ def nodes_until_hr nodes = [] until !body_has_more_nodes? || NodeAnalyzer.new(@body_enumerator.peek).hr? + nodes << @body_enumerator.next end nodes diff --git a/lib/article_json/import/google_doc/html/shared/caption.rb b/lib/article_json/import/google_doc/html/shared/caption.rb index a5d53f15..4cfca1a0 100644 --- a/lib/article_json/import/google_doc/html/shared/caption.rb +++ b/lib/article_json/import/google_doc/html/shared/caption.rb @@ -8,6 +8,7 @@ module Caption # @return [Array[ArticleJSON::Elements::Text]] def caption return [] if no_caption? + ArticleJSON::Import::GoogleDoc::HTML::TextParser.extract( node: @caption_node, css_analyzer: @css_analyzer diff --git a/lib/article_json/import/google_doc/html/shared/float.rb b/lib/article_json/import/google_doc/html/shared/float.rb index d201ca85..d181b0ca 100644 --- a/lib/article_json/import/google_doc/html/shared/float.rb +++ b/lib/article_json/import/google_doc/html/shared/float.rb @@ -8,9 +8,11 @@ module Float # @return [Symbol] def float return unless @float_node.has_attribute?('class') + node_class = @float_node.attribute('class').value || '' return :right if @css_analyzer.right_aligned?(node_class) return :left if @css_analyzer.left_aligned?(node_class) + nil end end diff --git a/lib/article_json/import/google_doc/html/text_box_parser.rb b/lib/article_json/import/google_doc/html/text_box_parser.rb index 9e19471c..ce7962fb 100644 --- a/lib/article_json/import/google_doc/html/text_box_parser.rb +++ b/lib/article_json/import/google_doc/html/text_box_parser.rb @@ -10,7 +10,7 @@ class TextBoxParser # May contain tags, too. 
# @param [Array[Nokogiri::HTML::Node]] nodes # @param [ArticleJSON::Import::GoogleDoc::HTML::CSSAnalyzer] css_analyzer - def initialize(type_node: ,nodes:, css_analyzer:) + def initialize(type_node:, nodes:, css_analyzer:) @nodes = nodes.reject { |node| NodeAnalyzer.new(node).empty? } @css_analyzer = css_analyzer @@ -32,6 +32,7 @@ def tags match = /(.*?)[\s\u00A0]+\[(?<tags>.*)\]/ .match(@type_node.inner_text) return [] unless match + match[:tags].split(' ') end diff --git a/lib/article_json/import/google_doc/html/text_parser.rb b/lib/article_json/import/google_doc/html/text_parser.rb index 5d7be76c..7ab262f9 100644 --- a/lib/article_json/import/google_doc/html/text_parser.rb +++ b/lib/article_json/import/google_doc/html/text_parser.rb @@ -43,6 +43,7 @@ def href if @node.name == 'span' && @node.first_element_child&.name == 'a' && @node.first_element_child&.has_attribute?('href') + strip_google_redirect( @node.first_element_child.attribute('href').value ) @@ -68,6 +69,7 @@ class << self def extract(node:, css_analyzer:) node.children.map do |child_node| next if NodeAnalyzer.new(child_node).empty? + new(node: child_node, css_analyzer: css_analyzer).element end.compact end diff --git a/lib/article_json/utils/additional_element_placer.rb b/lib/article_json/utils/additional_element_placer.rb index 8c5403c3..3597b3be 100644 --- a/lib/article_json/utils/additional_element_placer.rb +++ b/lib/article_json/utils/additional_element_placer.rb @@ -40,6 +40,7 @@ def initialize(elements, additional_elements) # @return [Array[ArticleJSON::Elements::Base|Object]] def merge_elements return @additional_elements if @elements.nil? || @elements.empty? + remaining_elements = @additional_elements.dup next_in = insert_next_element_in(0, remaining_elements) characters_passed = 0 @@ -48,6 +49,7 @@ def merge_elements .each_with_object([]) do |(element, next_element), result| result << element next if remaining_elements.empty?
+ if element.respond_to?(:length) characters_passed += element.length next_in -= element.length diff --git a/lib/article_json/utils/o_embed_resolver/base.rb b/lib/article_json/utils/o_embed_resolver/base.rb index 2052152b..4859a7bc 100644 --- a/lib/article_json/utils/o_embed_resolver/base.rb +++ b/lib/article_json/utils/o_embed_resolver/base.rb @@ -23,8 +23,10 @@ def oembed_data def unavailable_message [ ArticleJSON::Elements::Text.new(content: "The #{name} "), - ArticleJSON::Elements::Text.new(content: source_url, - href: source_url), + ArticleJSON::Elements::Text.new( + content: source_url, + href: source_url + ), ArticleJSON::Elements::Text.new(content: ' is not available.'), ] end @@ -44,14 +46,13 @@ def source_url # @return [Hash|nil] def parsed_api_response return @api_response if defined? @api_response + @api_response = begin uri = URI.parse(oembed_url) http = Net::HTTP.new(uri.host, uri.port) http.use_ssl = (uri.scheme == 'https') response = http.request(Net::HTTP::Get.new(uri, http_headers)) - if response.kind_of? Net::HTTPSuccess - JSON.parse(response.body, symbolize_names: true) - end + JSON.parse(response.body, symbolize_names: true) if response.is_a? 
Net::HTTPSuccess rescue Net::ProtocolError, JSON::ParserError nil end diff --git a/lib/article_json/utils/o_embed_resolver/facebook_video.rb b/lib/article_json/utils/o_embed_resolver/facebook_video.rb index 2e2b1805..8dafd1a1 100644 --- a/lib/article_json/utils/o_embed_resolver/facebook_video.rb +++ b/lib/article_json/utils/o_embed_resolver/facebook_video.rb @@ -12,7 +12,7 @@ def name # @return [String] def oembed_url "https://graph.facebook.com/v9.0/oembed_video?url=#{source_url}" \ - "&access_token=#{access_token}" + "&access_token=#{access_token}" end # The video URL of the element diff --git a/lib/article_json/utils/o_embed_resolver/slideshare.rb b/lib/article_json/utils/o_embed_resolver/slideshare.rb index 2751f605..048cae8b 100644 --- a/lib/article_json/utils/o_embed_resolver/slideshare.rb +++ b/lib/article_json/utils/o_embed_resolver/slideshare.rb @@ -11,8 +11,8 @@ def name # The URL for the oembed API call # @return [String] def oembed_url - 'https://www.slideshare.net/api/oembed/2?format=json&url='\ - "#{source_url}" + 'https://www.slideshare.net/api/oembed/2?format=json&url=' \ + "#{source_url}" end # The URL of the slideshow