Class: Kiba::Extend::Transforms::Count::UniqueVals

Inherits:
Object
  • Object
show all
Defined in:
lib/kiba/extend/transforms/count/unique_vals.rb

Overview

Note:

This transform runs in memory, so for very large sources, it may take a long time or fail.

Writes the count of unique values in a field to the given target field. Optionally, the values can be grouped under another field for counting.

Examples:

Ungrouped, case sensitive, do not count blanks

# Used in pipeline as:
# transform Count::UniqueVals,
#   value_field: :foo,
#   target: :fooct
xform = Count::UniqueVals.new(
  value_field: :foo,
  target: :fooct
)
input = [
  {foo: "a"},
  {foo: "A"},
  {foo: "c"},
  {foo: "C"},
  {foo: "c"},
  {foo: "c"},
  {foo: nil},
  {foo: ""}
]
result = Kiba::StreamingRunner.transform_stream(input, xform)
  .map{ |row| row }
# With the defaults (casesensitive: true, count_blank: false) the distinct
# values counted are "a", "A", "c", and "C". The nil and "" rows add nothing
# to the count, but they still receive the same total of 4.
expected = [
  {foo: "a", fooct: 4},
  {foo: "A", fooct: 4},
  {foo: "c", fooct: 4},
  {foo: "C", fooct: 4},
  {foo: "c", fooct: 4},
  {foo: "c", fooct: 4},
  {foo: nil, fooct: 4},
  {foo: "", fooct: 4}
]
expect(result).to eq(expected)

Ungrouped, case sensitive, count blanks

# Used in pipeline as:
# transform Count::UniqueVals,
#   value_field: :foo,
#   target: :fooct,
#   count_blank: true
xform = Count::UniqueVals.new(
  value_field: :foo,
  target: :fooct,
  count_blank: true
)
input = [
  {foo: "a"},
  {foo: "A"},
  {foo: "c"},
  {foo: "C"},
  {foo: "c"},
  {foo: "c"},
  {foo: nil},
  {foo: ""},
]
result = Kiba::StreamingRunner.transform_stream(input, xform)
  .map{ |row| row }
# Four distinct non-blank values ("a", "A", "c", "C") plus the blanks give 5,
# not 6: nil and "" together add only one to the count.
expected = [
  {foo: "a", fooct: 5},
  {foo: "A", fooct: 5},
  {foo: "c", fooct: 5},
  {foo: "C", fooct: 5},
  {foo: "c", fooct: 5},
  {foo: "c", fooct: 5},
  {foo: nil, fooct: 5},
  {foo: "", fooct: 5}
]
expect(result).to eq(expected)

Ungrouped, case insensitive

# Used in pipeline as:
# transform Count::UniqueVals,
#   value_field: :foo,
#   target: :fooct,
#   casesensitive: false
xform = Count::UniqueVals.new(
  value_field: :foo,
  target: :fooct,
  casesensitive: false
)
input = [
  {foo: "a"},
  {foo: "A"},
  {foo: "c"},
  {foo: "C"},
  {foo: "c"},
  {foo: "c"}
]
result = Kiba::StreamingRunner.transform_stream(input, xform)
  .map{ |row| row }
# With casesensitive: false, "a"/"A" count as one value and "c"/"C" as
# another, so the total is 2. The original casing of each row is unchanged.
expected = [
  {foo: "a", fooct: 2},
  {foo: "A", fooct: 2},
  {foo: "c", fooct: 2},
  {foo: "C", fooct: 2},
  {foo: "c", fooct: 2},
  {foo: "c", fooct: 2}
]
expect(result).to eq(expected)

Grouped, case sensitive, count blanks

# Used in pipeline as:
# transform Count::UniqueVals,
#   value_field: :accdate,
#   target: :datect,
#   group_field: :accnum,
#   count_blank: true
xform = Count::UniqueVals.new(
  value_field: :accdate,
  target: :datect,
  group_field: :accnum,
  count_blank: true
)
input = [
  {accnum: "A1", accdate: "2025-09-01"},
  {accnum: "A1", accdate: ""},
  {accnum: "A1", accdate: "2025-09-01"},
  {accnum: "A2", accdate: "2025-09-05"},
  {accnum: "A2", accdate: "2025-09-05"},
  {accnum: "A2", accdate: "2025-07-30"},
  {accnum: "A3", accdate: "2025-09-09"},
  {accnum: "A3", accdate: "2025-09-09"},
  {accnum: "A3", accdate: "2025-09-09"},
]
result = Kiba::StreamingRunner.transform_stream(input, xform)
  .map{ |row| row }
# Counts are per accnum value rather than across the whole source:
#   A1 -> "2025-09-01" and the blank (counted because count_blank: true) = 2
#   A2 -> "2025-09-05" and "2025-07-30" = 2
#   A3 -> "2025-09-09" only = 1
expected = [
  {accnum: "A1", accdate: "2025-09-01", datect: 2},
  {accnum: "A1", accdate: "", datect: 2},
  {accnum: "A1", accdate: "2025-09-01", datect: 2},
  {accnum: "A2", accdate: "2025-09-05", datect: 2},
  {accnum: "A2", accdate: "2025-09-05", datect: 2},
  {accnum: "A2", accdate: "2025-07-30", datect: 2},
  {accnum: "A3", accdate: "2025-09-09", datect: 1},
  {accnum: "A3", accdate: "2025-09-09", datect: 1},
  {accnum: "A3", accdate: "2025-09-09", datect: 1},
]
expect(result).to eq(expected)

Instance Method Summary collapse

Constructor Details

#initialize(value_field:, target:, group_field: nil, casesensitive: true, count_blank: false) ⇒ UniqueVals

Returns a new instance of UniqueVals.

Parameters:

  • value_field (Symbol)

    field whose unique values will be counted

  • target (Symbol)

    field into which count will be written

  • group_field (nil, Symbol) (defaults to: nil)

    field under which counts will be grouped

  • casesensitive (Boolean) (defaults to: true)

    whether case matters in identifying duplicates vs. unique values

  • count_blank (Boolean) (defaults to: false)

    whether to count blank values as values



154
155
156
157
158
159
160
161
162
163
# File 'lib/kiba/extend/transforms/count/unique_vals.rb', line 154

# Stores the transform's options and sets up its empty working state.
#
# @param value_field [Symbol] field whose unique values will be counted
# @param target [Symbol] field into which count will be written
# @param group_field [nil, Symbol] field under which counts will be grouped
# @param casesensitive [Boolean] whether case matters in identifying
#   duplicates vs. unique values
# @param count_blank [Boolean] whether to count blank values as values
def initialize(
  value_field:,
  target:,
  group_field: nil,
  casesensitive: true,
  count_blank: false
)
  # Caller-supplied options, stored exactly as given.
  @value_field, @target, @group_field = value_field, target, group_field
  @casesensitive, @count_blank = casesensitive, count_blank

  # Working state; both containers start out empty.
  @grouper = {}
  @rows = []
end

Instance Method Details

#close ⇒ Object



172
173
174
175
# File 'lib/kiba/extend/transforms/count/unique_vals.rb', line 172

# Emits every buffered row with its count added.
#
# When no group_field is set, the number of entries in grouper is stored in
# @ct first. Each row held in rows is then passed through add_count_to and
# the result is yielded to the caller's block.
#
# @yield [row] result of add_count_to for each buffered row
#   (presumably the row with the target count written -- helper not shown here)
# @return [Object] whatever rows.each returns
def close
  unless group_field
    @ct = grouper.size
  end

  rows.each do |row|
    yield add_count_to(row)
  end
end

#process(row) ⇒ Object

Parameters:

  • row (Hash{ Symbol => String, nil })


166
167
168
169
170
# File 'lib/kiba/extend/transforms/count/unique_vals.rb', line 166

# Registers one row for counting and holds it back for later output.
#
# The row is first handed to extract_for_count, then appended to rows.
# Nothing is emitted from here; the buffered rows are yielded by #close.
#
# @param row [Hash{ Symbol => String, nil }]
# @return [nil] always
def process(row)
  extract_for_count(row)
  rows.push(row)

  nil
end