Class: RDF::Normalize::RDFC10

Inherits:
Base
  • Object
show all
Includes:
Enumerable, Util::Logger
Defined in:
lib/rdf/normalize/rdfc10.rb

Direct Known Subclasses

URGNA2012

Defined Under Namespace

Classes: IdentifierIssuer, NormalizationState

Instance Attribute Summary

Attributes inherited from Base

#dataset

Instance Method Summary collapse

Methods included from Enumerable

#canonicalize

Constructor Details

#initialize(enumerable, **options) ⇒ RDF::Enumerable

Create an enumerable with grounded nodes

Raises RuntimeError if the maximum number of levels of recursion is exceeded.

Parameters:

  • enumerable (RDF::Enumerable)
  • options (Hash)

    a customizable set of options

Options Hash (**options):

  • :max_calls (Integer) — default: 40

    Maximum number of calls allowed for recursive blank node labeling, as a multiple of the total number of blank nodes in the dataset.



24
25
26
27
28
29
30
# File 'lib/rdf/normalize/rdfc10.rb', line 24

# Stores the input dataset and the canonicalization options, defaulting the
# hash algorithm to SHA256 and rejecting unsupported algorithm names.
#
# @param [RDF::Enumerable] enumerable
# @param [Hash] options
# @option options [Symbol] :hash_algorithm (:SHA256)
#   One of :MD5, :SHA1, :SHA2, :SHA256, :SHA384, or :SHA512.
# @raise [UnknownHashAlgorithm] if `:hash_algorithm` is not one of the names above
def initialize(enumerable, **options)
  @dataset = enumerable
  @options = options
  @options[:hash_algorithm] ||= :SHA256

  algorithm = @options[:hash_algorithm]
  return if %i[MD5 SHA1 SHA2 SHA256 SHA384 SHA512].include?(algorithm)

  raise UnknownHashAlgorithm, "UnknownHashAlgorithm: #{algorithm.inspect}. Use one of MD5, SHA1, SHA2, SHA256, SHA384, or SHA512"
end

Instance Method Details

#each(&block) ⇒ Object

Yields each normalized statement



33
34
35
36
37
38
# File 'lib/rdf/normalize/rdfc10.rb', line 33

# Builds a fresh NormalizationState from the stored options and runs
# `normalize_statements` with it, forwarding the caller's block so that each
# normalized statement is yielded to it.
#
# @yield [statement] passed through to `normalize_statements`
# @return [Object] the value returned by `log_depth`
def each(&block)
  state = NormalizationState.new(**@options)
  log_debug("ca:")
  log_debug("  log point", "Entering the canonicalization function (4.5.3).")
  log_depth(depth: 2) do
    normalize_statements(state, &block)
  end
end

#normalize_statements(ns, &block) ⇒ Object (protected)



53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
# File 'lib/rdf/normalize/rdfc10.rb', line 53

# Runs the canonicalization steps over `dataset`, recording the blank node
# bookkeeping in `ns`. When a block is given, every statement of `dataset` is
# passed to it, with blank nodes swapped for nodes named by the canonical
# issuer.
#
# Debug log calls are interleaved with the steps; their order and leading
# indentation are preserved exactly because the messages look like a nested
# trace (NOTE(review): presumably YAML — confirm before reformatting them).
#
# @param [NormalizationState] ns
#   state that collects the node-to-statement map, the hash-to-node map, the
#   call budget (`max_calls`) and the canonical issuer
# @yield [statement] each statement after blank node relabeling
# @return [RDF::Enumerable] the original `dataset`, not the relabeled statements
def normalize_statements(ns, &block)
  # Step 2: Map BNodes to the statements they are used by
  dataset.each_statement do |statement|
    statement.to_quad.compact.select(&:node?).each do |node|
      ns.add_statement(node, statement)
    end
  end
  log_debug("ca.2:")
  log_debug("  log point", "Extract quads for each bnode (4.5.3 (2)).")
  log_debug("  Bnode to quads:")
  # Only walk the map when the logger level is 0, so the per-statement
  # serialization is skipped otherwise.
  if logger && logger.level == 0
    ns.bnode_to_statements.each do |bn, statements|
      log_debug("    #{bn.id}:")
      statements.each do |s|
        log_debug {"      - #{s.to_nquads.strip}"}
      end
    end
  end

  ns.hash_to_bnodes = {}

  # Step 3: Calculate hashes for first degree nodes
  log_debug("ca.3:")
  log_debug("  log point", "Calculated first degree hashes (4.5.3 (3)).")
  log_debug("  with:")
  ns.bnode_to_statements.each_key do |node|
    log_debug("    - identifier") {node.id}
    log_debug("      h1dq:")
    hash = log_depth(depth: 8) {ns.hash_first_degree_quads(node)}
    ns.add_bnode_hash(node, hash)
  end

  # Step 4: Create canonical replacements for hashes mapping to a single node
  log_debug("ca.4:")
  log_debug("  log point", "Create canonical replacements for hashes mapping to a single node (4.5.3 (4)).")
  log_debug("  with:") unless ns.hash_to_bnodes.empty?
  # Iterate over a sorted copy of the keys so entries can be deleted from the
  # map while looping.
  ns.hash_to_bnodes.keys.sort.each do |hash|
    identifier_list = ns.hash_to_bnodes[hash]
    next if identifier_list.length > 1
    node = identifier_list.first
    id = ns.canonical_issuer.issue_identifier(node)
    log_debug("    - identifier") {node.id}
    log_debug("      hash", hash)
    log_debug("      canonical label", id)
    ns.hash_to_bnodes.delete(hash)
  end

  # Step 5: Iterate over hashes having more than one node
  # NOTE(review): the "log point" line is emitted even when the "ca.5:" header
  # is suppressed — confirm whether that is intended.
  log_debug("ca.5:") unless ns.hash_to_bnodes.empty?
  log_debug("  log point", "Calculate hashes for identifiers with shared hashes (4.5.3 (5)).")
  log_debug("  with:") unless ns.hash_to_bnodes.empty?

  # Initialize the number of calls allowed to hash_n_degree_quads
  # as a multiple of the total number of blank nodes in the dataset.
  # `||` rather than `fetch` so that an explicit `max_calls: nil` also falls
  # back to the default instead of raising on `Integer * nil`.
  max_calls_multiplier = @options[:max_calls] || 40
  ns.max_calls = ns.bnode_to_statements.keys.length * max_calls_multiplier

  ns.hash_to_bnodes.keys.sort.each do |hash|
    identifier_list = ns.hash_to_bnodes[hash]

    log_debug("    - hash", hash)
    log_debug("      identifier list") {identifier_list.map(&:id).to_json(indent: ' ')}
    hash_path_list = []

    # Create a hash_path_list for all bnodes using a temporary identifier used to create canonical replacements
    log_debug("      ca.5.2:")
    log_debug("        log point", "Calculate hashes for identifiers with shared hashes (4.5.3 (5.2)).")
    log_debug("        with:") unless identifier_list.empty?
    identifier_list.each do |identifier|
      # Skip nodes that already received a canonical identifier.
      next if ns.canonical_issuer.issued.include?(identifier)
      temporary_issuer = IdentifierIssuer.new("b")
      temporary_issuer.issue_identifier(identifier)
      log_debug("          - identifier") {identifier.id}
      hash_path_list << log_depth(depth: 12) {ns.hash_n_degree_quads(identifier, temporary_issuer)}
    end

    # Create canonical replacements for nodes
    # NOTE(review): as in step 5, the "log point" line is unconditional while
    # its "ca.5.3:" header is not — confirm whether that is intended.
    log_debug("      ca.5.3:") unless hash_path_list.empty?
    log_debug("        log point", "Canonical identifiers for temporary identifiers (4.5.3 (5.3)).")
    log_debug("        issuer:") unless hash_path_list.empty?
    # Each entry is destructured as a pair; the first element is only used as
    # the sort key, the second is the issuer whose nodes get canonical names.
    hash_path_list.sort_by(&:first).each do |_hash, issuer|
      issuer.issued.each do |node|
        id = ns.canonical_issuer.issue_identifier(node)
        log_debug("          - blank node") {node.id}
        log_debug("            canonical identifier", id)
      end
    end
  end

  # Step 6: Yield statements using BNodes from canonical replacements
  if block_given?
    dataset.each_statement do |statement|
      if statement.has_blank_nodes?
        quad = statement.to_quad.compact.map do |term|
          term.node? ? RDF::Node.intern(ns.canonical_issuer.identifier(term)) : term
        end
        block.call RDF::Statement.from(quad)
      else
        block.call statement
      end
    end
  end

  log_debug("ca.6:")
  log_debug("  log point", "Issued identifiers map (4.4.3 (6)).")
  log_debug("  issued identifiers map: #{ns.canonical_issuer.inspect}")
  dataset
end

#to_hash ⇒ Hash{String => String}

Returns a map from input blank node identifiers to canonical blank node identifiers.

Returns:

  • (Hash{String => String})


43
44
45
46
47
48
49
# File 'lib/rdf/normalize/rdfc10.rb', line 43

# Runs `normalize_statements` without a block against a fresh
# NormalizationState, then returns the canonical issuer's hash form.
#
# @return [Hash{String => String}]
#   map from input blank node identifiers to canonical blank node identifiers
def to_hash
  state = NormalizationState.new(**@options)
  log_debug("ca:")
  log_debug("  log point", "Entering the canonicalization function (4.5.3).")
  log_depth(depth: 2) do
    normalize_statements(state)
  end
  state.canonical_issuer.to_hash
end