Class: Kiba::Extend::Transforms::Deduplicate::Table

Inherits:
Object
  • Object
show all
Defined in:
lib/kiba/extend/transforms/deduplicate/table.rb

Overview

Note:

This transform runs in memory, so for very large sources it may take a long time or fail. In that case, use a combination of Flag and FilterRows::FieldEqualTo instead.

Given a field on which to deduplicate, removes duplicate rows from the table. The first row of each set of rows containing the same value in the given field is kept. Various additional functionality is configurable via the arguments passed to the transform. See examples and #initialize for details.

Tip: Use CombineValues::FromFieldsWithDelimiter or CombineValues::FullRecord to create a combined field on which to deduplicate

Examples:

With defaults

# Used in pipeline as:
# transform Deduplicate::Table, field: :combine
xform = Deduplicate::Table.new(field: :combine)

input = [
  {foo: "a", bar: "b", baz: "f", combine: "a b"},
  {foo: "a", bar: "b", baz: "f", combine: "a b"},
  {foo: "c", bar: "d", baz: "g", combine: "c d"},
  {foo: "c", bar: "e", baz: "h", combine: "c e"},
  {foo: "c", bar: "d", baz: "i", combine: "c d"},
  {foo: "c", bar: "d", baz: "j", combine: "c d"},
  {foo: "c", bar: "d", baz: "k", combine: "c d"}
]
result = Kiba::StreamingRunner.transform_stream(input, xform)
  .map{ |row| row }
expected = [
  {foo: "a", bar: "b", baz: "f", combine: "a b"},
  {foo: "c", bar: "d", baz: "g", combine: "c d"},
  {foo: "c", bar: "e", baz: "h", combine: "c e"},
]
expect(result).to eq(expected)

When delete_field == true

# Used in pipeline as:
# transform Deduplicate::Table,
#   field: :combine,
#   delete_field: true
xform = Deduplicate::Table.new(field: :combine, delete_field: true)

input = [
  {foo: "a", bar: "b", baz: "f", combine: "a b"},
  {foo: "a", bar: "b", baz: "f", combine: "a b"},
  {foo: "c", bar: "d", baz: "g", combine: "c d"},
  {foo: "c", bar: "e", baz: "h", combine: "c e"},
  {foo: "c", bar: "d", baz: "i", combine: "c d"},
  {foo: "c", bar: "d", baz: "j", combine: "c d"},
  {foo: "c", bar: "d", baz: "k", combine: "c d"}
]
result = Kiba::StreamingRunner.transform_stream(input, xform)
  .map{ |row| row }
expected = [
  {foo: "a", bar: "b", baz: "f"},
  {foo: "c", bar: "d", baz: "g"},
  {foo: "c", bar: "e", baz: "h"},
]
expect(result).to eq(expected)

Gathering examples

# Used in pipeline as:
# transform Deduplicate::Table,
#   field: :combine,
#   delete_field: true,
#   example_source_field: :baz,
#   max_examples: 2,
#   example_target_field: :ex,
#   example_delim: " ; "
xform = Deduplicate::Table.new(field: :combine, delete_field: true,
  example_source_field: :baz, max_examples: 2,
  example_target_field: :ex, example_delim: " ; ")

input = [
  {foo: "a", bar: "b", baz: "f", combine: "a b"},
  {foo: "a", bar: "b", baz: "f", combine: "a b"},
  {foo: "c", bar: "d", baz: "g", combine: "c d"},
  {foo: "c", bar: "e", baz: "h", combine: "c e"},
  {foo: "c", bar: "d", baz: "i", combine: "c d"},
  {foo: "c", bar: "d", baz: "j", combine: "c d"},
  {foo: "c", bar: "d", baz: "k", combine: "c d"}
]
result = Kiba::StreamingRunner.transform_stream(input, xform)
  .map{ |row| row }
expected = [
  {foo: "a", bar: "b", baz: "f", ex: "f ; f"},
  {foo: "c", bar: "d", baz: "g", ex: "g ; i"},
  {foo: "c", bar: "e", baz: "h", ex: "h"},
]
expect(result).to eq(expected)

Reporting occurrence count

# Used in pipeline as:
# transform Deduplicate::Table,
#   field: :combine,
#   delete_field: true,
#   example_source_field: :baz,
#   max_examples: 2,
#   include_occs: true
xform = Deduplicate::Table.new(field: :combine, delete_field: true,
  example_source_field: :baz, max_examples: 2,
  include_occs: true
)
input = [
  {foo: "a", bar: "b", baz: "f", combine: "a b"},
  {foo: "a", bar: "b", baz: "f", combine: "a b"},
  {foo: "c", bar: "d", baz: "g", combine: "c d"},
  {foo: "c", bar: "e", baz: "h", combine: "c e"},
  {foo: "c", bar: "d", baz: "i", combine: "c d"},
  {foo: "c", bar: "d", baz: "j", combine: "c d"},
  {foo: "c", bar: "d", baz: "k", combine: "c d"}
]
result = Kiba::StreamingRunner.transform_stream(input, xform)
  .map{ |row| row }
expected = [
  {foo: "a", bar: "b", baz: "f", examples: "f|f", occurrences: 2},
  {foo: "c", bar: "d", baz: "g", examples: "g|i", occurrences: 4},
  {foo: "c", bar: "e", baz: "h", examples: "h", occurrences: 1},
]
expect(result).to eq(expected)

Reporting occurrence count without examples

# Used in pipeline as:
# transform Deduplicate::Table,
#   field: :combine,
#   delete_field: true,
#   include_occs: true
xform = Deduplicate::Table.new(field: :combine, delete_field: true,
  include_occs: true
)
input = [
  {foo: "a", bar: "b", baz: "f", combine: "a b"},
  {foo: "a", bar: "b", baz: "f", combine: "a b"},
  {foo: "c", bar: "d", baz: "g", combine: "c d"},
  {foo: "c", bar: "e", baz: "h", combine: "c e"},
  {foo: "c", bar: "d", baz: "i", combine: "c d"},
  {foo: "c", bar: "d", baz: "j", combine: "c d"},
  {foo: "c", bar: "d", baz: "k", combine: "c d"}
]
result = Kiba::StreamingRunner.transform_stream(input, xform)
  .map{ |row| row }
expected = [
  {foo: "a", bar: "b", baz: "f", occurrences: 2},
  {foo: "c", bar: "d", baz: "g", occurrences: 4},
  {foo: "c", bar: "e", baz: "h", occurrences: 1},
]
expect(result).to eq(expected)

Compiling unique field values into one field

# Used in pipeline as:
# transform Deduplicate::Table,
#   field: :combine,
#   delete_field: true,
#   compile_uniq_fieldvals: true,
#   compile_delim: ", "
xform = Deduplicate::Table.new(field: :combine, delete_field: true,
  compile_uniq_fieldvals: true, compile_delim: ", ")
input = [
  {foo: "a", bar: "b", baz: "f", combine: "a b"},
  {foo: "a", bar: "b", baz: "f", combine: "a b"},
  {foo: "c", bar: "d", baz: "g", combine: "c d"},
  {foo: "c", bar: "d", baz: "", combine: "c d"},
  {foo: "c", bar: "e", baz: "h", combine: "c e"},
  {foo: "c", bar: "d", baz: "i", combine: "c d"},
  {foo: "c", bar: "d", baz: "j", combine: "c d"},
  {foo: "c", bar: "d", baz: nil, combine: "c d"},
  {foo: "c", bar: "d", baz: "k", combine: "c d"},
  {foo: "e", bar: "f", baz: nil, combine: "e f"}
]
result = Kiba::StreamingRunner.transform_stream(input, xform)
  .map{ |row| row }
expected = [
  {foo: "a", bar: "b", baz: "f"},
  {foo: "c", bar: "d", baz: "g, i, j, k"},
  {foo: "c", bar: "e", baz: "h"},
  {foo: "e", bar: "f", baz: ""}
]
expect(result).to eq(expected)

Combining examples, occs, and unique field value compile

# Used in pipeline as:
# transform Deduplicate::Table,
#   field: :combine,
#   delete_field: true,
#   example_source_field: :foo,
#   example_target_field: :ex,
#   max_examples: 4,
#   include_occs: true,
#   occs_target_field: :occs,
#   compile_uniq_fieldvals: true,
#   compile_delim: ", "
xform = Deduplicate::Table.new(
  field: :combine,
  delete_field: true,
  example_source_field: :foo,
  example_target_field: :ex,
  max_examples: 4,
  include_occs: true,
  occs_target_field: :occs,
  compile_uniq_fieldvals: true,
  compile_delim: ", "
)
input = [
  {foo: "a", bar: "b", baz: "f", combine: "a b"},
  {foo: "aa", bar: "b", baz: "f", combine: "a b"},
  {foo: "c", bar: "d", baz: "g", combine: "c d"},
  {foo: "cc", bar: "d", baz: "", combine: "c d"},
  {foo: "c", bar: "e", baz: "h", combine: "c e"},
  {foo: "ccc", bar: "d", baz: "i", combine: "c d"},
  {foo: "cc", bar: "d", baz: "j", combine: "c d"},
  {foo: "c", bar: "d", baz: nil, combine: "c d"},
  {foo: "c", bar: "d", baz: "k", combine: "c d"},
  {foo: "e", bar: "f", baz: nil, combine: "e f"}
]
result = Kiba::StreamingRunner.transform_stream(input, xform)
  .map{ |row| row }
expected = [
  {occs: 2, ex: "a|aa", bar: "b", baz: "f"},
  {occs: 6, ex: "c|cc|ccc|cc", bar: "d", baz: "g, i, j, k"},
  {occs: 1, ex: "c", bar: "e", baz: "h"},
  {occs: 1, ex: "e", bar: "f", baz: ""}
]
expect(result).to eq(expected)

Compiling unique field values keeping dedupe field

# Used in pipeline as:
# transform Deduplicate::Table,
#   field: :combine,
#   delete_field: false,
#   compile_uniq_fieldvals: true,
#   compile_delim: ", "
xform = Deduplicate::Table.new(field: :combine, delete_field: false,
  compile_uniq_fieldvals: true, compile_delim: ", ")
input = [
  {baz: "f", combine: "a b"},
  {baz: "f", combine: "a b"},
  {baz: "g", combine: "c d"},
  {baz: "", combine: "c d"},
  {baz: "h", combine: "c e"},
  {baz: "i", combine: "c d"},
  {baz: "j", combine: "c d"},
  {baz: nil, combine: "c d"},
  {baz: "k", combine: "c d"},
  {baz: nil, combine: "e f"}
]
result = Kiba::StreamingRunner.transform_stream(input, xform)
  .map{ |row| row }
expected = [
  {baz: "f", combine: "a b"},
  {baz: "g, i, j, k", combine: "c d"},
  {baz: "h", combine: "c e"},
  {baz: "", combine: "e f"}
]
expect(result).to eq(expected)

Since:

  • 2.2.0

Instance Method Summary collapse

Constructor Details

#initialize(field:, delete_field: false, example_source_field: nil, max_examples: 10, example_target_field: :examples, example_delim: Kiba::Extend.delim, include_occs: false, occs_target_field: :occurrences, compile_uniq_fieldvals: false, compile_delim: Kiba::Extend.delim) ⇒ Table

Returns a new instance of Table.

Parameters:

  • field (Symbol)

    name of field on which to deduplicate

  • delete_field (Boolean) (defaults to: false)

    whether to delete the deduplication field after doing deduplication

  • example_source_field (nil, Symbol) (defaults to: nil)

    field containing values to be compiled as examples

  • max_examples (Integer) (defaults to: 10)

    maximum number of example values to return

  • example_target_field (Symbol) (defaults to: :examples)

    name of field in which to report example values

  • example_delim (String) (defaults to: Kiba::Extend.delim)

    used to join multiple example values

  • include_occs (Boolean) (defaults to: false)

    whether to report number of occurrences of each field value being deduplicated on

  • occs_target_field (Symbol) (defaults to: :occurrences)

    name of field in which to report occurrences

  • compile_uniq_fieldvals (Boolean) (defaults to: false)

    whether to compile all unique values of each field across duplicate row set into the row that is kept. Values of each field are concatenated in order of row occurrence, then deduplicated

  • compile_delim (String) (defaults to: Kiba::Extend.delim)

    used to join compiled unique field values

Since:

  • 2.2.0



286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
# File 'lib/kiba/extend/transforms/deduplicate/table.rb', line 286

# Stores the transform's configuration and sets up its empty working state.
#
# @param field [Symbol] name of field on which to deduplicate
# @param delete_field [Boolean] whether to delete the deduplication field
#   after doing deduplication
# @param example_source_field [nil, Symbol] field containing values to be
#   compiled as examples
# @param max_examples [Integer] maximum number of example values to return
# @param example_target_field [Symbol] name of field in which to report
#   example values
# @param example_delim [String] used to join multiple example values
# @param include_occs [Boolean] whether to report number of occurrences of
#   each field value being deduplicated on
# @param occs_target_field [Symbol] name of field in which to report
#   occurrences
# @param compile_uniq_fieldvals [Boolean] whether to compile all unique
#   values of each field across a duplicate row set into the kept row
# @param compile_delim [String] used to join compiled unique field values
def initialize(
  field:,
  delete_field: false,
  example_source_field: nil,
  max_examples: 10,
  example_target_field: :examples,
  example_delim: Kiba::Extend.delim,
  include_occs: false,
  occs_target_field: :occurrences,
  compile_uniq_fieldvals: false,
  compile_delim: Kiba::Extend.delim
)
  # Deduplication key, and whether it is stripped from emitted rows
  @field = field
  @delete = delete_field

  # Example gathering
  @example = example_source_field
  @ex_target = example_target_field
  @max_examples = max_examples
  @delim = example_delim

  # Occurrence counting
  @occs = include_occs
  @occ_target = occs_target_field

  # Unique field value compilation
  @compile_uniq_fieldvals = compile_uniq_fieldvals
  @compile_delim = compile_delim

  # Working state; starts out empty
  @deduper = {}
end

Instance Method Details

#close ⇒ Object

Since:

  • 2.2.0



316
317
318
319
320
321
322
323
324
325
# File 'lib/kiba/extend/transforms/deduplicate/table.rb', line 316

# Yields one finished row per entry held in the deduper.
#
# For each entry, the stored row is decorated in this order: example field
# (when an example source field is configured), occurrence count (when
# occurrence reporting is on), compiled unique field values (when
# compilation is on), and finally removal of the deduplication field (when
# deletion is on). Deletion runs last, after the compile step.
#
# @yieldparam row [Hash] the finished row for one deduplication value
def close
  deduper.each_value do |entry|
    kept = entry[:row]
    add_example_field(kept, entry) if example
    kept[occ_target] = entry[:occs] if occs
    # compiled_row returns a new Hash, so rebind rather than mutate
    kept = compiled_row(entry, kept) if compile_uniq_fieldvals
    kept.delete(field) if delete
    yield kept
  end
end

#compiled_row(hash, row) ⇒ Object

Since:

  • 2.2.0



327
328
329
330
331
332
333
334
335
336
337
# File 'lib/kiba/extend/transforms/deduplicate/table.rb', line 327

# Builds a new row in which ordinary fields carry their compiled values.
#
# - The example source field is left out of the result.
# - The deduplication field, example target field, and occurrence target
#   field keep the value they already have in `row`.
# - Every other field is replaced by its entry in `hash[:fieldvals]`, joined
#   with the compile delimiter.
# Any field whose resulting value is nil is left out of the result.
#
# @param hash [Hash] deduper entry; `hash[:fieldvals]` is read per field and
#   is expected to respond to `join`
# @param row [Hash] the row being finalized; it is not mutated
# @return [Hash] new row, with keys in the same order as `row`
def compiled_row(hash, row)
  passthrough = [field, ex_target, occ_target]

  row.each_with_object({}) do |(name, current), compiled|
    next if name == example

    value =
      if passthrough.include?(name)
        current
      else
        hash[:fieldvals][name].join(compile_delim)
      end
    compiled[name] = value unless value.nil?
  end
end

#process(row) ⇒ Object

Parameters:

  • row (Hash{ Symbol => String, nil })

Since:

  • 2.2.0



305
306
307
308
309
310
311
312
313
314
# File 'lib/kiba/extend/transforms/deduplicate/table.rb', line 305

# Hands one incoming row to the helpers that record it for deduplication.
#
# Rows whose deduplication field is missing or blank are skipped: none of
# the helpers is called for them. For every other row, `get_row` is always
# called, and the occurrence, example, and compile helpers are called only
# when the corresponding option is enabled.
#
# @param row [Hash{ Symbol => String, nil }]
# @return [nil] always; no row is returned from this method
def process(row)
  dedupe_val = row.fetch(field, nil)

  unless dedupe_val.blank?
    get_row(dedupe_val, row)
    get_occ(dedupe_val, row) if occs
    get_example(dedupe_val, row) if example
    compile_values(dedupe_val, row) if compile_uniq_fieldvals
  end

  nil
end