Class: RDF::Tabular::Dialect

Inherits:

Metadata

Object
Metadata
RDF::Tabular::Dialect

show all

Defined in: lib/rdf/tabular/metadata.rb

Constant Summary collapse

DEFAULTS = Defaults for dialects

# Default values for dialect properties, keyed by property name.
# Note: the `headerRowCount` and `trim` reader methods on this class compute
# their own fallbacks (from `header` and `skipInitialSpace` respectively)
# when the property is absent from the underlying object.
{
  commentPrefix:      false,
  delimiter:          ",".freeze,
  doubleQuote:        true,
  encoding:           "utf-8".freeze,
  header:             true,
  headerRowCount:     1,
  lineTerminators:    :auto,
  quoteChar:          '"'.freeze,
  skipBlankRows:      false,
  skipColumns:        0,
  skipInitialSpace:   false,
  skipRows:           0,
  trim:               true
}.freeze

PROPERTIES =

# Dialect property names mapped to a property kind (`:link` or `:atomic`).
# NOTE(review): presumably the kind selects how the Metadata base class
# validates/normalizes each value — confirm against Metadata.
{
  :@id             => :link,
  :@type           => :atomic,
  commentPrefix:      :atomic,
  delimiter:          :atomic,
  doubleQuote:        :atomic,
  encoding:           :atomic,
  header:             :atomic,
  headerRowCount:     :atomic,
  lineTerminators:    :atomic,
  quoteChar:          :atomic,
  skipBlankRows:      :atomic,
  skipColumns:        :atomic,
  skipInitialSpace:   :atomic,
  skipRows:           :atomic,
  trim:               :atomic,
}.freeze

REQUIRED =

# Empty list: no dialect property names are listed as required.
[].freeze

Constants inherited from Metadata

Metadata::DATATYPES, Metadata::INHERITED_DEFAULTS, Metadata::INHERITED_PROPERTIES, Metadata::LOCAL_CONTEXT, Metadata::NAME_SYNTAX

Instance Attribute Summary

Attributes inherited from Metadata

#filenames, #id, #object, #parent, #url

Instance Method Summary collapse

#embedded_metadata(input, metadata, **options) ⇒ Metadata

Extract a new Metadata document from the file or data provided.
#escape_character ⇒ String

Escape character: `"` when doubleQuote is set, otherwise a backslash.
#headerRowCount ⇒ Integer

Number of header rows; when not set explicitly, defaults to one if header is true and zero if header is false.
#trim ⇒ Boolean, String

Trim setting; when not set explicitly, defaults to 'start' if skipInitialSpace is set and true otherwise.

Methods inherited from Metadata

#==, #[], #[]=, #base, #common_properties, #context, #datatype=, #default_value, #describes_file?, #dialect, #dialect=, #each, #each_row, for_input, #has_annotations?, #initialize, #inspect, #normalize!, #normalize_jsonld, open, #root, #set_array_value, #set_nl, #set_property, site_wide_config, #tableSchema=, #tables=, #to_json, #transformations=, #type, #valid?, #valid_natural_language_property?, #validate, #validate!, #verify_compatible!

Constructor Details

This class inherits a constructor from RDF::Tabular::Metadata

Instance Method Details

#embedded_metadata(input, metadata, **options) ⇒ `Metadata`

Extract a new Metadata document from the file or data provided

Parameters:

input (#read, #to_s) —

IO, or file path or URL
metadata (Table) —

used for saving annotations created while extracting metadata
options (Hash{Symbol => Object}) —

any additional options (see RDF::Util::File.open_file)

Options Hash (**options):

:lang (String) —

language to set in table, if any

Returns:

(Metadata) —

Tabular metadata

See Also:

http://w3c.github.io/csvw/syntax/#parsing

# File 'lib/rdf/tabular/metadata.rb', line 1792

# Extract a new Metadata document from the file or data provided.
#
# Column titles are collected either from the `<th>` cells of an HTML table
# (located through the fragment identifier of the base URL) or from the
# header rows of a CSV input. For CSV input, the first `skipRows` rows are
# recorded on `metadata` under "rdfs:comment".
#
# @param [#read, #to_s] input IO, or file path or URL
# @param [Table] metadata
#   used for saving annotations created while extracting metadata
# @param [Hash{Symbol => Object}] options
#   any additional options (see `RDF::Util::File.open_file`)
# @option options [String] :lang language to set in table, if any
# @return [Metadata] Tabular metadata
# @raise [Error] if the input is HTML and no element matches the fragment
# @see http://w3c.github.io/csvw/syntax/#parsing
def embedded_metadata(input, metadata, **options)
  options = options.dup
  options.delete(:context) # Don't accidentally use a passed context
  # Normalize input to an IO object
  if input.is_a?(String)
    return ::RDF::Util::File.open_file(input) {|f| embedded_metadata(f, metadata, **options.merge(base: input.to_s))}
  end

  table = {
    "@context" => "http://www.w3.org/ns/csvw",
    "url" => (options.fetch(:base, "")),
    "@type" => "Table",
    "tableSchema" => {
      "@type" => "Schema",
      "columns" => []
    }
  }
  metadata ||= table  # In case the embedded metadata becomes the final metadata
  lang = metadata["lang"] = options[:lang] if options[:lang]
  lang ||= 'und'

  # Dialect settings are invariant for the duration of the call; read them once.
  skip_cols = skipColumns.to_i
  trim_mode = trim.to_s
  strip_left = %w(true start).include?(trim_mode)
  strip_right = %w(true end).include?(trim_mode)

  # Applies the dialect's trim setting without mutating the caller's string.
  trim_value = lambda do |text|
    text = text.lstrip if strip_left
    text = text.rstrip if strip_right
    text
  end

  # Records one header cell as a title of the column at `index`, honouring
  # skipColumns. Empty cells are ignored (checked before trimming).
  columns = table["tableSchema"]["columns"]
  add_title = lambda do |value, index|
    next if index < skip_cols || value.to_s.empty?

    column = columns[index - skip_cols] ||= {
      "titles" => {lang => []},
    }
    column["titles"][lang] << trim_value.call(value)
  end

  # Inputs without a #base_uri (e.g. StringIO) fall back to an empty path.
  path = input.base_uri.path rescue ""
  if path.end_with?('.html') || input.respond_to?(:content_type) && input.content_type == 'text/html'
    # Input is HTML; use fragment identifier to find table.
    fragment = RDF::URI(table["url"]).fragment rescue nil
    tab = begin
      # Extract with nokogiri. `defined?` must be given the constant itself:
      # a symbol literal is always "defined", which would skip the require.
      require 'nokogiri' unless defined?(::Nokogiri)
      doc = Nokogiri::HTML.parse(input)
      doc.search("##{fragment}").first if fragment
    rescue LoadError
      # Extract with REXML
      # FIXME: not implemented; falls through to the error below
      nil
    end

    raise Error, "Expected to find HTML table identified by fragment identifer ##{fragment}" unless tab

    # Use rows with <th> to create column titles
    tab.xpath('.//tr').each do |row|
      row.xpath('th').map(&:content).each_with_index do |value, index|
        add_title.call(value, index)
      end
    end
  else
    csv = ::CSV.new(input, **csv_options)
    skipRows.to_i.times do
      # Skip initial lines, these form comment annotations.
      # `shift` returns nil once the input is exhausted, hence Array().
      value = trim_value.call(Array(csv.shift).join(delimiter))

      # Remove the whole comment prefix, whatever its length
      if commentPrefix && value.start_with?(commentPrefix)
        value = value[commentPrefix.length..-1].strip
      end
      (metadata["rdfs:comment"] ||= []) << value unless value.empty?
    end
    log_debug("embedded_metadata") {"notes: #{table["notes"].inspect}"}

    # `to_i` guards against a nil count, which would make `1..nil` endless
    headerRowCount.to_i.times do
      Array(csv.shift).each_with_index do |value, index|
        add_title.call(value, index)
      end
    end
  end
  log_debug("embedded_metadata") {"table: #{table.inspect}"}
  input.rewind if input.respond_to?(:rewind)

  Table.new(table, **options.merge(reason: "load embedded metadata: #{table['@id']}"))
end

#escape_character ⇒ `String`

Escape character: `"` when doubleQuote is set, otherwise a backslash

Returns:

(String)



1766
1767
1768

# File 'lib/rdf/tabular/metadata.rb', line 1766

# Escape character for this dialect.
#
# @return [String] a double quote when `doubleQuote` is truthy,
#   otherwise a single backslash
def escape_character
  return '"' if doubleQuote

  '\\'
end

#headerRowCount ⇒ `Integer`

Number of header rows; when not set explicitly, defaults to one if header is true and zero if header is false

Returns:

(Integer)



1772
1773
1774

# File 'lib/rdf/tabular/metadata.rb', line 1772

# Number of header rows.
#
# Returns the `:headerRowCount` entry of `object` when present; otherwise
# 1 if `header` is truthy and 0 if it is not.
#
# @return [Integer]
def headerRowCount
  settings = object
  implied =
    if header
      1
    else
      0
    end
  settings.fetch(:headerRowCount, implied)
end

#trim ⇒ `Boolean`, `String`

Trim setting; when not set explicitly, defaults to 'start' if skipInitialSpace is set and true otherwise

Returns:

(Boolean, String)



1778
1779
1780

# File 'lib/rdf/tabular/metadata.rb', line 1778

# Trim setting for this dialect.
#
# Returns the `:trim` entry of `object` when present; otherwise 'start'
# if `skipInitialSpace` is truthy and `true` if it is not.
#
# @return [Boolean, String]
def trim
  settings = object
  implied =
    if skipInitialSpace
      'start'
    else
      true
    end
  settings.fetch(:trim, implied)
end

Class: RDF::Tabular::Dialect

Constant Summary collapse

Constants inherited from Metadata

Instance Attribute Summary

Attributes inherited from Metadata

Instance Method Summary collapse

Methods inherited from Metadata

Constructor Details

Instance Method Details

#embedded_metadata(input, metadata, **options) ⇒ Metadata

#escape_character ⇒ String

#headerRowCount ⇒ Integer

#trim ⇒ Boolean, String

#embedded_metadata(input, metadata, **options) ⇒ `Metadata`

#escape_character ⇒ `String`

#headerRowCount ⇒ `Integer`

#trim ⇒ `Boolean`, `String`