Class: Kiba::Extend::Transforms::Deduplicate::Table

Inherits:
Object
  • Object
show all
Defined in:
lib/kiba/extend/transforms/deduplicate/table.rb

Overview

Note:

This transform runs in memory, so for very large sources, it may take a long time or fail. In this case, use a combination of Flag and FilterRows::FieldEqualTo.

Given a field on which to deduplicate, removes duplicate rows from table. The first row of each set of rows containing the same value in the given field is kept. Various additional functionality is configurable via the arguments passed to the transform. See examples and #initialize for details.

Tip: Use CombineValues::FromFieldsWithDelimiter or CombineValues::FullRecord to create a combined field on which to deduplicate.

Examples:

With defaults

# Used in pipeline as:
# transform Deduplicate::Table, field: :combine
xform = Deduplicate::Table.new(field: :combine)

# Seven rows carrying three distinct :combine values: "a b", "c d", "c e".
input = [
  {foo: "a", bar: "b", baz: "f", combine: "a b"},
  {foo: "a", bar: "b", baz: "f", combine: "a b"},
  {foo: "c", bar: "d", baz: "g", combine: "c d"},
  {foo: "c", bar: "e", baz: "h", combine: "c e"},
  {foo: "c", bar: "d", baz: "i", combine: "c d"},
  {foo: "c", bar: "d", baz: "j", combine: "c d"},
  {foo: "c", bar: "d", baz: "k", combine: "c d"}
]
result = Kiba::StreamingRunner.transform_stream(input, xform)
  .map{ |row| row }
# One row per distinct :combine value survives: the first one encountered.
# :combine itself stays in the output because delete_field was not passed.
expected = [
  {foo: "a", bar: "b", baz: "f", combine: "a b"},
  {foo: "c", bar: "d", baz: "g", combine: "c d"},
  {foo: "c", bar: "e", baz: "h", combine: "c e"},
]
expect(result).to eq(expected)

When delete_field == true

# Used in pipeline as:
# transform Deduplicate::Table,
#   field: :combine,
#   delete_field: true
xform = Deduplicate::Table.new(field: :combine, delete_field: true)

# Seven rows carrying three distinct :combine values: "a b", "c d", "c e".
input = [
  {foo: "a", bar: "b", baz: "f", combine: "a b"},
  {foo: "a", bar: "b", baz: "f", combine: "a b"},
  {foo: "c", bar: "d", baz: "g", combine: "c d"},
  {foo: "c", bar: "e", baz: "h", combine: "c e"},
  {foo: "c", bar: "d", baz: "i", combine: "c d"},
  {foo: "c", bar: "d", baz: "j", combine: "c d"},
  {foo: "c", bar: "d", baz: "k", combine: "c d"}
]
result = Kiba::StreamingRunner.transform_stream(input, xform)
  .map{ |row| row }
# The first row per distinct :combine value is kept, and the :combine
# field is removed from every output row.
expected = [
  {foo: "a", bar: "b", baz: "f"},
  {foo: "c", bar: "d", baz: "g"},
  {foo: "c", bar: "e", baz: "h"},
]
expect(result).to eq(expected)

Gathering examples

# Used in pipeline as:
# transform Deduplicate::Table,
#   field: :combine,
#   delete_field: true,
#   example_source_field: :baz,
#   max_examples: 2,
#   example_target_field: :ex,
#   example_delim: " ; "
xform = Deduplicate::Table.new(field: :combine, delete_field: true,
  example_source_field: :baz, max_examples: 2,
  example_target_field: :ex, example_delim: " ; ")

# Seven rows carrying three distinct :combine values: "a b", "c d", "c e".
input = [
  {foo: "a", bar: "b", baz: "f", combine: "a b"},
  {foo: "a", bar: "b", baz: "f", combine: "a b"},
  {foo: "c", bar: "d", baz: "g", combine: "c d"},
  {foo: "c", bar: "e", baz: "h", combine: "c e"},
  {foo: "c", bar: "d", baz: "i", combine: "c d"},
  {foo: "c", bar: "d", baz: "j", combine: "c d"},
  {foo: "c", bar: "d", baz: "k", combine: "c d"}
]
result = Kiba::StreamingRunner.transform_stream(input, xform)
  .map{ |row| row }
# :ex holds :baz values from the rows sharing a :combine value, joined
# with " ; " and capped at max_examples (2): the "c d" group has four
# :baz values but only "g" and "i" are reported. Example values are not
# deduplicated, hence "f ; f" for the "a b" group.
expected = [
  {foo: "a", bar: "b", baz: "f", ex: "f ; f"},
  {foo: "c", bar: "d", baz: "g", ex: "g ; i"},
  {foo: "c", bar: "e", baz: "h", ex: "h"},
]
expect(result).to eq(expected)

Reporting occurrence count

# Used in pipeline as:
# transform Deduplicate::Table,
#   field: :combine,
#   delete_field: true,
#   example_source_field: :baz,
#   max_examples: 2,
#   include_occs: true
xform = Deduplicate::Table.new(field: :combine, delete_field: true,
  example_source_field: :baz, max_examples: 2,
  include_occs: true
)
# Seven rows: "a b" occurs twice, "c d" four times, "c e" once.
input = [
  {foo: "a", bar: "b", baz: "f", combine: "a b"},
  {foo: "a", bar: "b", baz: "f", combine: "a b"},
  {foo: "c", bar: "d", baz: "g", combine: "c d"},
  {foo: "c", bar: "e", baz: "h", combine: "c e"},
  {foo: "c", bar: "d", baz: "i", combine: "c d"},
  {foo: "c", bar: "d", baz: "j", combine: "c d"},
  {foo: "c", bar: "d", baz: "k", combine: "c d"}
]
result = Kiba::StreamingRunner.transform_stream(input, xform)
  .map{ |row| row }
# No target field names or example delimiter were passed, so the default
# targets :examples and :occurrences are used; the "|" joiner shown here
# is the value of the default example_delim (Kiba::Extend.delim) in this
# example. :occurrences is the number of input rows per :combine value.
expected = [
  {foo: "a", bar: "b", baz: "f", examples: "f|f", occurrences: 2},
  {foo: "c", bar: "d", baz: "g", examples: "g|i", occurrences: 4},
  {foo: "c", bar: "e", baz: "h", examples: "h", occurrences: 1},
]
expect(result).to eq(expected)

Compiling unique field values into one field

# Used in pipeline as:
# transform Deduplicate::Table,
#   field: :combine,
#   delete_field: true,
#   compile_uniq_fieldvals: true,
#   compile_delim: ", "
xform = Deduplicate::Table.new(field: :combine, delete_field: true,
  compile_uniq_fieldvals: true, compile_delim: ", ")
# Ten rows with four distinct :combine values. The "c d" group includes
# one empty-string and one nil :baz value; the "e f" group has only nil.
input = [
  {foo: "a", bar: "b", baz: "f", combine: "a b"},
  {foo: "a", bar: "b", baz: "f", combine: "a b"},
  {foo: "c", bar: "d", baz: "g", combine: "c d"},
  {foo: "c", bar: "d", baz: "", combine: "c d"},
  {foo: "c", bar: "e", baz: "h", combine: "c e"},
  {foo: "c", bar: "d", baz: "i", combine: "c d"},
  {foo: "c", bar: "d", baz: "j", combine: "c d"},
  {foo: "c", bar: "d", baz: nil, combine: "c d"},
  {foo: "c", bar: "d", baz: "k", combine: "c d"},
  {foo: "e", bar: "f", baz: nil, combine: "e f"}
]
result = Kiba::StreamingRunner.transform_stream(input, xform)
  .map{ |row| row }
# Every field of a kept row holds the unique values seen across its
# duplicate set, in order of occurrence, joined with ", ". Empty-string
# and nil values contribute nothing, so a group whose only :baz value is
# nil ends up with "".
expected = [
  {foo: "a", bar: "b", baz: "f"},
  {foo: "c", bar: "d", baz: "g, i, j, k"},
  {foo: "c", bar: "e", baz: "h"},
  {foo: "e", bar: "f", baz: ""}
]
expect(result).to eq(expected)

Since:

  • 2.2.0

Instance Method Summary collapse

Constructor Details

#initialize(field:, delete_field: false, example_source_field: nil, max_examples: 10, example_target_field: :examples, example_delim: Kiba::Extend.delim, include_occs: false, occs_target_field: :occurrences, compile_uniq_fieldvals: false, compile_delim: Kiba::Extend.delim) ⇒ Table

Returns a new instance of Table.

Parameters:

  • field (Symbol)

    name of field on which to deduplicate

  • delete_field (Boolean) (defaults to: false)

    whether to delete the deduplication field after doing deduplication

  • example_source_field (nil, Symbol) (defaults to: nil)

    field containing values to be compiled as examples

  • max_examples (Integer) (defaults to: 10)

    maximum number of example values to return

  • example_target_field (Symbol) (defaults to: :examples)

    name of field in which to report example values

  • example_delim (String) (defaults to: Kiba::Extend.delim)

    used to join multiple example values

  • include_occs (Boolean) (defaults to: false)

    whether to report number of occurrences of each field value being deduplicated on

  • occs_target_field (Symbol) (defaults to: :occurrences)

    name of field in which to report occurrences

  • compile_uniq_fieldvals (Boolean) (defaults to: false)

    whether to compile all unique values of each field across duplicate row set into the row that is kept. Values of each field are concatenated in order of row occurrence, then deduplicated

  • compile_delim (String) (defaults to: Kiba::Extend.delim)

    used to join compiled unique field values

Since:

  • 2.2.0



183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
# File 'lib/kiba/extend/transforms/deduplicate/table.rb', line 183

# Stores the transform's configuration and starts with an empty
# deduplication accumulator.
#
# @param field [Symbol] name of field on which to deduplicate
# @param delete_field [Boolean] whether to delete the deduplication field
#   after doing deduplication
# @param example_source_field [nil, Symbol] field containing values to be
#   compiled as examples
# @param max_examples [Integer] maximum number of example values to return
# @param example_target_field [Symbol] name of field in which to report
#   example values
# @param example_delim [String] used to join multiple example values
# @param include_occs [Boolean] whether to report number of occurrences of
#   each field value being deduplicated on
# @param occs_target_field [Symbol] name of field in which to report
#   occurrences
# @param compile_uniq_fieldvals [Boolean] whether to compile all unique
#   values of each field across a duplicate row set into the kept row
# @param compile_delim [String] used to join compiled unique field values
def initialize(
  field:,
  delete_field: false,
  example_source_field: nil,
  max_examples: 10,
  example_target_field: :examples,
  example_delim: Kiba::Extend.delim,
  include_occs: false,
  occs_target_field: :occurrences,
  compile_uniq_fieldvals: false,
  compile_delim: Kiba::Extend.delim
)
  # Accumulator; starts empty
  @deduper = {}

  # Deduplication key handling
  @field = field
  @delete = delete_field

  # Example gathering
  @example = example_source_field
  @ex_target = example_target_field
  @max_examples = max_examples
  @delim = example_delim

  # Occurrence counting
  @occs = include_occs
  @occ_target = occs_target_field

  # Compilation of unique field values
  @compile_uniq_fieldvals = compile_uniq_fieldvals
  @compile_delim = compile_delim
end

Instance Method Details

#close ⇒ Object

Since:

  • 2.2.0



213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
# File 'lib/kiba/extend/transforms/deduplicate/table.rb', line 213

# Finalizes and emits the deduplicated rows, one per entry in the
# deduplication accumulator.
#
# For each accumulated entry the kept row is decorated in this order:
# the examples field is added (when an example source field is
# configured), the occurrence count is written, the deduplication field is
# deleted (when requested), and finally, when compiling unique field
# values, each field's value is replaced by its compiled values joined
# with the compile delimiter.
#
# In compile mode the deduplication field is always left out of the
# emitted row, whether or not deletion was requested.
#
# @yield [row] each finalized row
# @yieldparam row [Hash]
def close
  deduper.each_value do |hash|
    row = hash[:row]
    add_example_field(row, hash) if example
    row[occ_target] = hash[:occs] if occs
    row.delete(field) if delete
    if compile_uniq_fieldvals
      compiled = hash[:fieldvals]
      row = row.each_with_object({}) do |(fld, val), out|
        next if fld == field

        # Fields with no compiled entry (e.g. the example/occurrence
        # fields added above) keep their current value instead of
        # raising on a nil lookup.
        vals = compiled[fld]
        out[fld] = vals ? vals.join(compile_delim) : val
      end
    end
    yield row
  end
end

#process(row) ⇒ Object

Parameters:

  • row (Hash{ Symbol => String, nil })

Since:

  • 2.2.0



202
203
204
205
206
207
208
209
210
211
# File 'lib/kiba/extend/transforms/deduplicate/table.rb', line 202

# Records a row in the deduplication accumulator instead of passing it on.
#
# Rows whose deduplication field is missing or blank are ignored. For all
# other rows the row is registered under its field value, and occurrence
# counting, example gathering and value compilation are each applied only
# when enabled. Nothing is returned here; rows are yielded from #close.
#
# @param row [Hash{ Symbol => String, nil }]
# @return [nil] always
def process(row)
  key = row.fetch(field, nil)

  unless key.blank?
    get_row(key, row)
    get_occ(key, row) if occs
    get_example(key, row) if example
    compile_values(key, row) if compile_uniq_fieldvals
  end

  nil
end