Class: RDF::Tabular::Dialect

Inherits:
Metadata show all
Defined in:
lib/rdf/tabular/metadata.rb

Constant Summary collapse

DEFAULTS =

Defaults for dialects

# Default value for each dialect property. The notes below describe only how
# the methods shown in this class use a property; properties without a note
# are not referenced by the visible code.
# NOTE(review): presumably consulted when a dialect description omits the
# property -- confirm against the Metadata accessors.
{
  # When truthy, stripped from the start of skipped rows before they are
  # recorded as comments during embedded-metadata extraction.
  commentPrefix:      false,
  # Used to re-join the cells of skipped rows into a single comment string.
  delimiter:          ",".freeze,
  # #escape_character returns '"' when this is truthy, a backslash otherwise.
  doubleQuote:        true,
  encoding:           "utf-8".freeze,
  # #headerRowCount falls back to 1 when this is truthy, 0 otherwise.
  header:             true,
  # Number of rows read to collect column titles.
  headerRowCount:     1,
  lineTerminators:    :auto,
  quoteChar:          '"'.freeze,
  skipBlankRows:      false,
  # Number of leading columns ignored when collecting column titles.
  skipColumns:        0,
  # #trim falls back to 'start' when this is truthy, true otherwise.
  skipInitialSpace:   false,
  # Number of leading rows consumed (and recorded as comments) before the
  # header rows are read.
  skipRows:           0,
  # true / 'start' / 'end' select which side(s) of a value are stripped.
  trim:               true
}.freeze
# Maps each dialect property name to its property kind: `@id` is a :link,
# every other listed property is :atomic.
PROPERTIES = {
  :@id              => :link,
  :@type            => :atomic,
  :commentPrefix    => :atomic,
  :delimiter        => :atomic,
  :doubleQuote      => :atomic,
  :encoding         => :atomic,
  :header           => :atomic,
  :headerRowCount   => :atomic,
  :lineTerminators  => :atomic,
  :quoteChar        => :atomic,
  :skipBlankRows    => :atomic,
  :skipColumns      => :atomic,
  :skipInitialSpace => :atomic,
  :skipRows         => :atomic,
  :trim             => :atomic
}.freeze
# Empty, frozen list.
# NOTE(review): presumably the names of properties a dialect must define -- confirm against Metadata.
REQUIRED = [].freeze

Constants inherited from Metadata

Metadata::DATATYPES, Metadata::INHERITED_DEFAULTS, Metadata::INHERITED_PROPERTIES, Metadata::LOCAL_CONTEXT, Metadata::NAME_SYNTAX

Instance Attribute Summary

Attributes inherited from Metadata

#filenames, #id, #object, #parent, #url

Instance Method Summary collapse

Methods inherited from Metadata

#==, #[], #[]=, #base, #common_properties, #context, #datatype=, #default_value, #describes_file?, #dialect, #dialect=, #each, #each_row, for_input, #has_annotations?, #initialize, #inspect, #normalize!, #normalize_jsonld, open, #root, #set_array_value, #set_nl, #set_property, site_wide_config, #tableSchema=, #tables=, #to_json, #transformations=, #type, #valid?, #valid_natural_language_property?, #validate, #validate!, #verify_compatible!

Constructor Details

This class inherits a constructor from RDF::Tabular::Metadata

Instance Method Details

#embedded_metadata(input, metadata, **options) ⇒ Metadata

Extract a new Metadata document from the file or data provided

Parameters:

  • input (#read, #to_s)

    IO, or file path or URL

  • metadata (Table)

    used for saving annotations created while extracting metadata

  • options (Hash{Symbol => Object})

    any additional options (see RDF::Util::File.open_file)

Options Hash (**options):

  • :lang (String)

    language to set in table, if any

Returns:

  • (Metadata)

See Also:



1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
# File 'lib/rdf/tabular/metadata.rb', line 1792

# Extract a new Metadata document from the file or data provided.
#
# Column titles are collected from the header rows of a CSV input, or from
# the `<th>` cells of the HTML table identified by the URL fragment. Rows
# skipped before the CSV header are recorded under "rdfs:comment" in
# `metadata`.
#
# @param [#read, #to_s] input
#   IO, or file path or URL. A String is opened with RDF::Util::File.open_file
#   and the method re-invoked on the resulting IO.
# @param [Table] metadata
#   used for saving annotations created while extracting metadata; when nil,
#   the table hash being built receives them instead
# @param [Hash{Symbol => Object}] options
#   any additional options (see RDF::Util::File.open_file); `:context` is
#   always discarded
# @option options [String] :lang language to set in table, if any
# @option options [String] :base used as the table "url"
# @return [Metadata] a Table built from the extracted description
# @raise [Error] if the input is HTML and no table matches the fragment
def embedded_metadata(input, metadata, **options)
  options = options.dup
  options.delete(:context) # Don't accidentally use a passed context
  # Normalize input to an IO object
  if input.is_a?(String)
    return ::RDF::Util::File.open_file(input) {|f| embedded_metadata(f, metadata, **options.merge(base: input.to_s))}
  end

  table = {
    "@context" => "http://www.w3.org/ns/csvw",
    "url" => (options.fetch(:base, "")),
    "@type" => "Table",
    "tableSchema" => {
      "@type" => "Schema",
      "columns" => []
    }
  }
  metadata ||= table  # In case the embedded metadata becomes the final metadata
  lang = metadata["lang"] = options[:lang] if options[:lang]
  lang ||= 'und'

  # Dialect settings that do not change while scanning the input
  skip_cols  = skipColumns.to_i
  trim_mode  = trim.to_s
  trim_start = %w(true start).include?(trim_mode)
  trim_end   = %w(true end).include?(trim_mode)

  # Records one header cell as a title of its (skip-adjusted) column.
  # Cells inside the skipped columns and empty cells are ignored.
  add_title = lambda do |value, index|
    return if index < skip_cols || value.to_s.empty?

    value.lstrip! if trim_start
    value.rstrip! if trim_end

    columns = table["tableSchema"]["columns"] ||= []
    column = columns[index - skip_cols] ||= {
      "titles" => {lang => []},
    }
    column["titles"][lang] << value
  end

  # Set encoding on input
  path = input.base_uri.path rescue ""
  if path.end_with?('.html') || input.respond_to?(:content_type) && input.content_type == 'text/html'
    # Input is HTML; use fragment identifier to find table.
    fragment = RDF::URI(table["url"]).fragment rescue nil
    tab = begin
      # Extract with nokogiri.
      # `defined?` must be given the constant itself; a symbol literal is
      # always "defined", which would skip the require unconditionally.
      require 'nokogiri' unless defined?(::Nokogiri)
      doc = Nokogiri::HTML.parse(input)
      doc.search("##{fragment}").first if fragment
    rescue LoadError
      # Extract with REXML
      # FIXME
    end

    raise Error, "Expected to find HTML table identified by fragment identifer ##{fragment}" unless tab

    # Use rows with <th> to create column titles
    tab.xpath('.//tr').each do |row|
      row.xpath('th').map(&:content).each_with_index(&add_title)
    end
  else
    csv = ::CSV.new(input, **csv_options)
    (1..skipRows.to_i).each do
      # Skip initial lines, these form comment annotations.
      # Array() guards against input shorter than skipRows (shift returns nil).
      value = Array(csv.shift).join(delimiter)
      # Trim value
      value.lstrip! if trim_start
      value.rstrip! if trim_end

      # Remove the whole prefix, which may be longer than one character
      value = value[commentPrefix.length..-1].strip if commentPrefix && value.start_with?(commentPrefix)
      (metadata["rdfs:comment"] ||= []) << value unless value.empty?
    end
    # NOTE(review): nothing in this method sets table["notes"]; the skipped
    # rows are stored under metadata["rdfs:comment"] -- confirm intended key.
    log_debug("embedded_metadata") {"notes: #{table["notes"].inspect}"}

    (1..headerRowCount).each do
      Array(csv.shift).each_with_index(&add_title)
    end
  end
  log_debug("embedded_metadata") {"table: #{table.inspect}"}
  input.rewind if input.respond_to?(:rewind)

  Table.new(table, **options.merge(reason: "load embedded metadata: #{table['@id']}"))
end

#escape_character ⇒ String

escape character

Returns:

  • (String)


1766
1767
1768
# File 'lib/rdf/tabular/metadata.rb', line 1766

# Character used to escape the quote character.
#
# @return [String] '"' when doubleQuote is truthy, otherwise a backslash
def escape_character
  return '"' if self.doubleQuote

  '\\'
end

#headerRowCount ⇒ Integer

default for headerRowCount is zero if header is false

Returns:

  • (Integer)


1772
1773
1774
# File 'lib/rdf/tabular/metadata.rb', line 1772

# Number of header rows. An explicit :headerRowCount entry in `object` wins;
# otherwise the value is 1 when `header` is truthy and 0 when it is not.
#
# @return [Integer]
def headerRowCount
  settings = object
  fallback = self.header ? 1 : 0
  settings.fetch(:headerRowCount, fallback)
end

#trim ⇒ Boolean, String

default for trim comes from skipInitialSpace

Returns:

  • (Boolean, String)


1778
1779
1780
# File 'lib/rdf/tabular/metadata.rb', line 1778

# Trim setting. An explicit :trim entry in `object` wins; otherwise the value
# is 'start' when `skipInitialSpace` is truthy and true when it is not.
#
# @return [Boolean, String]
def trim
  settings = object
  fallback = self.skipInitialSpace ? 'start' : true
  settings.fetch(:trim, fallback)
end