Module: RDF::Microdata::Reader::Nokogiri

Defined in:
lib/rdf/microdata/reader/nokogiri.rb

Overview

Nokogiri implementation of an HTML parser.

Defined Under Namespace

Classes: NodeProxy, NodeSetProxy

Class Method Summary collapse

Instance Method Summary collapse

Class Method Details

.library ⇒ Symbol

Returns the name of the underlying XML library.

Returns:

  • (Symbol)


12
13
14
# File 'lib/rdf/microdata/reader/nokogiri.rb', line 12

# Identifies the XML/HTML library that backs this reader implementation.
#
# @return [Symbol] always +:nokogiri+
def self.library
  :nokogiri
end

Instance Method Details

#doc_base(base) ⇒ String

Find value of document base

Parameters:

  • base (String)

    Existing base from URI or :base_uri

Returns:

  • (String)


224
225
226
227
228
229
# File 'lib/rdf/microdata/reader/nokogiri.rb', line 224

# Find value of document base.
#
# When the document has an `html>head>base` element, its `href` (with any
# fragment removed) replaces the supplied base. If that element has no usable
# `href` -- attribute missing, empty, or fragment-only, e.g.
# `<base target="_blank">` -- the supplied base is kept instead of being
# replaced by nil or an empty string.
#
# @param [String] base
#   Existing base from URI or :base_uri
# @return [String]
def doc_base(base)
  # find if the document has a base element
  base_el = @doc.at_css("html>head>base")
  return base unless base_el

  # "".split("#").first is nil and "#frag".split("#").first is "", so both
  # cases must fall back to the existing base.
  href = base_el.attribute("href").to_s.split("#").first
  (href.nil? || href.empty?) ? base : href
end

#doc_errors ⇒ Object

Document errors



213
214
215
216
217
# File 'lib/rdf/microdata/reader/nokogiri.rb', line 213

# Document errors.
#
# Returns the entries of @doc.errors whose string form does not match one of
# three known messages (doctype placement, missing doctype token, and an
# unexpected '?' where a start tag name is expected).
#
# @return [Array] the remaining error objects
def doc_errors
  ignorable = %r{(The doctype must be the first token in the document)|(Expected a doctype token)|(Unexpected '\?' where start tag name is expected)}
  @doc.errors.reject { |error| error.to_s.match?(ignorable) }
end

#find_element_by_id(id) ⇒ Object

Look up an element in the document by id



241
242
243
# File 'lib/rdf/microdata/reader/nokogiri.rb', line 241

# Look up an element in the document by id.
#
# The id is bound as an XPath variable rather than interpolated into a CSS
# selector, so ids containing characters that have selector meaning (".", ":",
# whitespace, a leading digit, ...) are compared literally instead of raising
# a selector syntax error or selecting a different element.
#
# @param [#to_s] id value of the id attribute to look for
# @return [NodeProxy, nil] proxy for the first matching element, or nil when
#   the id is empty or nothing matches
def find_element_by_id(id)
  id = id.to_s
  return nil if id.empty?

  # Arguments are: XPath expression, namespace bindings, variable bindings.
  (e = @doc.at_xpath('//*[@id=$id]', nil, { id: id })) && NodeProxy.new(e)
end

#getItems ⇒ Object

Based on Microdata element.getItems



235
236
237
# File 'lib/rdf/microdata/reader/nokogiri.rb', line 235

# Based on Microdata element.getItems
#
# Selects every element carrying an `itemscope` attribute, discards those that
# also carry `itemprop`, and wraps each remaining element in a NodeProxy.
#
# @return [Array<NodeProxy>]
def getItems
  scoped = @doc.css('[itemscope]')
  without_itemprop = scoped.reject { |node| node.has_attribute?('itemprop') }
  without_itemprop.map { |node| NodeProxy.new(node) }
end

#initialize_html(input, **options) ⇒ void

This method returns an undefined value.

Initializes the underlying XML library.

Parameters:

  • options (Hash{Symbol => Object})


181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
# File 'lib/rdf/microdata/reader/nokogiri.rb', line 181

# Initializes the underlying XML library, storing the parsed document in @doc.
#
# An input that is already a Nokogiri document is used as is. Any other input
# is read (when it responds to #read) and parsed with Nokogiri::HTML5, falling
# back to Nokogiri::HTML.parse when that raises LoadError or NoMethodError.
#
# The markup is re-tagged with the selected encoding on a copy, so a String
# passed by the caller is never mutated and frozen strings are accepted.
#
# @param [Nokogiri::XML::Document, #read, String] input
# @param [Hash{Symbol => Object}] options
# @option options [#to_s] :encoding
#   takes precedence over input.charset; 'utf-8' when neither is available
# @return [void]
def initialize_html(input, **options)
  require 'nokogiri' unless defined?(::Nokogiri)
  @doc = case input
  when ::Nokogiri::XML::Document
    input
  else
    # Explicit option first, then the charset reported by the input,
    # otherwise default to utf-8
    encoding = options[:encoding]
    encoding ||= input.charset if input.respond_to?(:charset)
    encoding = (encoding || 'utf-8').to_s

    begin
      # String#force_encoding mutates its receiver and raises on frozen
      # strings, so only ever apply it to a string owned by this method:
      # either freshly read content or a copy of the caller's string.
      input = input.respond_to?(:read) ? input.read : input.dup
      ::Nokogiri::HTML5(input.force_encoding(encoding), max_parse_errors: 1000)
    rescue LoadError, NoMethodError
      ::Nokogiri::HTML.parse(input, base_uri.to_s, encoding)
    end
  end
end

#root ⇒ Object

Return proxy for document root



207
208
209
# File 'lib/rdf/microdata/reader/nokogiri.rb', line 207

# Return proxy for document root.
#
# Yields nil whenever @doc is unset or has no root element; otherwise the
# NodeProxy wrapping @doc.root is built on first use and cached in @root.
#
# @return [NodeProxy, nil]
def root
  return unless @doc && @doc.root

  @root ||= NodeProxy.new(@doc.root)
end