Column Operations¶
Here you will see a detailed overview of all the column operations available in Optimus.
You can access the operation via df.cols
Let’s create a sample dataframe to start working.
# Import Optimus
from optimus import Optimus
# Create Optimus instance
op = Optimus()
from pyspark.sql.types import StructType, StructField, StringType, BooleanType, IntegerType, ArrayType
df = op.create.df(
[
("words", "str", True),
("num", "int", True),
("animals", "str", True),
("thing", StringType(), True),
("two strings", StringType(), True),
("filter", StringType(), True),
("num 2", "string", True),
("col_array", ArrayType(StringType()), True),
("col_int", ArrayType(IntegerType()), True)
]
,
[
(" I like fish ", 1, "dog", "housé", "cat-car", "a","1",["baby", "sorry"],[1,2,3]),
(" zombies", 2, "cat", "tv", "dog-tv", "b","2",["baby 1", "sorry 1"],[3,4]),
("simpsons cat lady", 2, "frog", "table","eagle-tv-plus","1","3", ["baby 2", "sorry 2"], [5,6,7]),
(None, 3, "eagle", "glass", "lion-pc", "c","4", ["baby 3", "sorry 3"] ,[7,8])
])
To see the dataframe we will use the table()
function, a much better way to see your results,
instead of the built-in `show()
function.
df.table()
words | num | animals | thing | two strings | filter | num 2 | col_array | col_int |
I like fish | 1 | dog | housé | cat-car | a | 1 | [baby, sorry] | [1, 2, 3] |
zombies | 2 | cat | tv | dog-tv | b | 2 | [baby 1, sorry 1] | [3, 4] |
simpsons cat lady | 2 | frog | table | eagle-tv-plus | 1 | 3 | [baby 2, sorry 2] | [5, 6, 7] |
null | 3 | eagle | glass | lion-pc | c | 4 | [baby 3, sorry 3] | [7, 8] |
cols.append(col_name=None, value=None)¶
Appends a column to a Dataframe
df = df.cols.append("new_col_1", 1)
df.table()
words | num | animals | thing | two strings | filter | num 2 | col_array | col_int | new_col_1 |
I like fish | 1 | dog | housé | cat-car | a | 1 | [baby, sorry] | [1, 2, 3] | 1 |
zombies | 2 | cat | tv | dog-tv | b | 2 | [baby 1, sorry 1] | [3, 4] | 1 |
simpsons cat lady | 2 | frog | table | eagle-tv-plus | 1 | 3 | [baby 2, sorry 2] | [5, 6, 7] | 1 |
null | 3 | eagle | glass | lion-pc | c | 4 | [baby 3, sorry 3] | [7, 8] | 1 |
from pyspark.sql.functions import *
df.cols.append([
("new_col_2", 2.22),
("new_col_3", lit(3))
]).table()
words | num | animals | thing | two strings | filter | num 2 | col_array | col_int | new_col_1 | new_col_2 | new_col_3 |
I like fish | 1 | dog | housé | cat-car | a | 1 | [baby, sorry] | [1, 2, 3] | 1 | 2.22 | 3 |
zombies | 2 | cat | tv | dog-tv | b | 2 | [baby 1, sorry 1] | [3, 4] | 1 | 2.22 | 3 |
simpsons cat lady | 2 | frog | table | eagle-tv-plus | 1 | 3 | [baby 2, sorry 2] | [5, 6, 7] | 1 | 2.22 | 3 |
null | 3 | eagle | glass | lion-pc | c | 4 | [baby 3, sorry 3] | [7, 8] | 1 | 2.22 | 3 |
df.cols.append([
("new_col_4", "test"),
("new_col_5", df['num']*2),
("new_col_6", [1,2,3])
]).table()
words | num | animals | thing | two strings | filter | num 2 | col_array | col_int | new_col_1 | new_col_4 | new_col_5 | new_col_6 |
I like fish | 1 | dog | housé | cat-car | a | 1 | [baby, sorry] | [1, 2, 3] | 1 | test | 2 | [1, 2, 3] |
zombies | 2 | cat | tv | dog-tv | b | 2 | [baby 1, sorry 1] | [3, 4] | 1 | test | 4 | [1, 2, 3] |
simpsons cat lady | 2 | frog | table | eagle-tv-plus | 1 | 3 | [baby 2, sorry 2] | [5, 6, 7] | 1 | test | 4 | [1, 2, 3] |
null | 3 | eagle | glass | lion-pc | c | 4 | [baby 3, sorry 3] | [7, 8] | 1 | test | 6 | [1, 2, 3] |
cols.select(columns=None, regex=None, data_type=None)¶
Select columns using index, column name, regex or data type
columns = ["words", 1, "animals", 3]
df.cols.select(columns).table()
words | num | animals | thing |
I like fish | 1 | dog | housé |
zombies | 2 | cat | tv |
simpsons cat lady | 2 | frog | table |
null | 3 | eagle | glass |
df.cols.select("n.*", regex = True).show()
num | num 2 | new_col_1 |
1 | 1 | 1 |
2 | 2 | 1 |
2 | 3 | 1 |
3 | 4 | 1 |
df.cols.select("*", data_type = "str").table()
thing | words | animals | filter | two strings | num 2 |
housé | I like fish | dog | a | cat-car | 1 |
tv | zombies | cat | b | dog-tv | 2 |
table | simpsons cat lady | frog | 1 | eagle-tv-plus | 3 |
glass | null | eagle | c | lion-pc | 4 |
cols.rename(columns_old_new=None, func=None)¶
Changes the name of a column(s) dataFrame.
df.cols.rename('num','number').table()
words | number | animals | thing | two strings | filter | num 2 | col_array | col_int | new_col_1 |
I like fish | 1 | dog | housé | cat-car | a | 1 | [baby, sorry] | [1, 2, 3] | 1 |
zombies | 2 | cat | tv | dog-tv | b | 2 | [baby 1, sorry 1] | [3, 4] | 1 |
simpsons cat lady | 2 | frog | table | eagle-tv-plus | 1 | 3 | [baby 2, sorry 2] | [5, 6, 7] | 1 |
null | 3 | eagle | glass | lion-pc | c | 4 | [baby 3, sorry 3] | [7, 8] | 1 |
df.cols.rename([('num','number'),("animals","gods")], str.upper).table()
WORDS | NUM | ANIMALS | THING | TWO STRINGS | FILTER | NUM 2 | COL_ARRAY | COL_INT | NEW_COL_1 |
I like fish | 1 | dog | housé | cat-car | a | 1 | [baby, sorry] | [1, 2, 3] | 1 |
zombies | 2 | cat | tv | dog-tv | b | 2 | [baby 1, sorry 1] | [3, 4] | 1 |
simpsons cat lady | 2 | frog | table | eagle-tv-plus | 1 | 3 | [baby 2, sorry 2] | [5, 6, 7] | 1 |
null | 3 | eagle | glass | lion-pc | c | 4 | [baby 3, sorry 3] | [7, 8] | 1 |
df.cols.rename(str.lower).table()
words | num | animals | thing | two strings | filter | num 2 | col_array | col_int | new_col_1 |
I like fish | 1 | dog | housé | cat-car | a | 1 | [baby, sorry] | [1, 2, 3] | 1 |
zombies | 2 | cat | tv | dog-tv | b | 2 | [baby 1, sorry 1] | [3, 4] | 1 |
simpsons cat lady | 2 | frog | table | eagle-tv-plus | 1 | 3 | [baby 2, sorry 2] | [5, 6, 7] | 1 |
null | 3 | eagle | glass | lion-pc | c | 4 | [baby 3, sorry 3] | [7, 8] | 1 |
cols.cast()¶
Cast multiple columns to a specific datatype.
List of tuples of column names and types to be casted. This variable should have the following structure:
colsAndTypes = [(‘columnName1’, ‘integer’), (‘columnName2’, ‘float’), (‘columnName3’, ‘string’)]
The first parameter in each tuple is the column name, the second is the final datatype of column after the transformation is made.
df.cols.cast([("num", "string"),("num 2", "integer")]).dtypes
[('words', 'string'),
('num', 'string'),
('animals', 'string'),
('thing', 'string'),
('two strings', 'string'),
('filter', 'string'),
('num 2', 'int'),
('col_array', 'array<string>'),
('col_int', 'array<int>'),
('new_col_1', 'int')]
You can cast all columns to a specific type too.
df.cols.cast("*", "string").dtypes
[('words', 'string'),
('num', 'string'),
('animals', 'string'),
('thing', 'string'),
('two strings', 'string'),
('filter', 'string'),
('num 2', 'string'),
('col_array', 'string'),
('col_int', 'string'),
('new_col_1', 'string')]
cols.keep(columns=None, regex=None)¶
Only keep the columns specified.
df.cols.keep("num").table()
num |
1 |
2 |
2 |
3 |
cols.move(column, position, ref_col)¶
Move a column to specific position
df.cols.move("words", "after", "thing").table()
num | animals | thing | words | two strings | filter | num 2 | col_array | col_int | new_col_1 |
1 | dog | housé | I like fish | cat-car | a | 1 | [baby, sorry] | [1, 2, 3] | 1 |
2 | cat | tv | zombies | dog-tv | b | 2 | [baby 1, sorry 1] | [3, 4] | 1 |
2 | frog | table | simpsons cat lady | eagle-tv-plus | 1 | 3 | [baby 2, sorry 2] | [5, 6, 7] | 1 |
3 | eagle | glass | null | lion-pc | c | 4 | [baby 3, sorry 3] | [7, 8] | 1 |
cols.sort(order=”asc”)¶
Sort dataframes columns asc or desc
df.cols.sort().table()
animals | col_array | col_int | filter | new_col_1 | num | num 2 | thing | two strings | words |
dog | [baby, sorry] | [1, 2, 3] | a | 1 | 1 | 1 | housé | cat-car | I like fish |
cat | [baby 1, sorry 1] | [3, 4] | b | 1 | 2 | 2 | tv | dog-tv | zombies |
frog | [baby 2, sorry 2] | [5, 6, 7] | 1 | 1 | 2 | 3 | table | eagle-tv-plus | simpsons cat lady |
eagle | [baby 3, sorry 3] | [7, 8] | c | 1 | 3 | 4 | glass | lion-pc | null |
df.cols.sort(order = "desc").table()
words | two strings | thing | num 2 | num | new_col_1 | filter | col_int | col_array | animals |
I like fish | cat-car | housé | 1 | 1 | 1 | a | [1, 2, 3] | [baby, sorry] | dog |
zombies | dog-tv | tv | 2 | 2 | 1 | b | [3, 4] | [baby 1, sorry 1] | cat |
simpsons cat lady | eagle-tv-plus | table | 3 | 2 | 1 | 1 | [5, 6, 7] | [baby 2, sorry 2] | frog |
null | lion-pc | glass | 4 | 3 | 1 | c | [7, 8] | [baby 3, sorry 3] | eagle |
cols.drop()¶
Drops a list of columns
df2 = df.cols.drop("num")
df2.table()
words | animals | thing | two strings | filter | num 2 | col_array | col_int | new_col_1 |
I like fish | dog | housé | cat-car | a | 1 | [baby, sorry] | [1, 2, 3] | 1 |
zombies | cat | tv | dog-tv | b | 2 | [baby 1, sorry 1] | [3, 4] | 1 |
simpsons cat lady | frog | table | eagle-tv-plus | 1 | 3 | [baby 2, sorry 2] | [5, 6, 7] | 1 |
null | eagle | glass | lion-pc | c | 4 | [baby 3, sorry 3] | [7, 8] | 1 |
df2 = df.cols.drop(["num","words"])
df2.table()
animals | thing | two strings | filter | num 2 | col_array | col_int | new_col_1 |
dog | housé | cat-car | a | 1 | [baby, sorry] | [1, 2, 3] | 1 |
cat | tv | dog-tv | b | 2 | [baby 1, sorry 1] | [3, 4] | 1 |
frog | table | eagle-tv-plus | 1 | 3 | [baby 2, sorry 2] | [5, 6, 7] | 1 |
eagle | glass | lion-pc | c | 4 | [baby 3, sorry 3] | [7, 8] | 1 |
Chaining¶
The past transformations were done step by step, but this can be achieved by chaining all operations into one line of code, like the cell below. This way is much more efficient and scalable because it uses all optimization issues from the lazy evaluation approach.
df\
.cols.rename([('num','number')])\
.cols.drop(["number","words"])\
.withColumn("new_col_2", lit("spongebob"))\
.cols.append("new_col_1", 1)\
.cols.sort(order= "desc")\
.rows.drop(df["num 2"] == 3)\
.table()
two strings | thing | num 2 | new_col_2 | new_col_1 | filter | col_int | col_array | animals |
cat-car | housé | 1 | spongebob | 1 | a | [1, 2, 3] | [baby, sorry] | dog |
dog-tv | tv | 2 | spongebob | 1 | b | [3, 4] | [baby 1, sorry 1] | cat |
lion-pc | glass | 4 | spongebob | 1 | c | [7, 8] | [baby 3, sorry 3] | eagle |
cols.unnest(columns, mark=None, n=None, index=None)¶
Split array or string in different columns
df.cols.unnest("two strings","-").table()
words | num | animals | thing | two strings | filter | num 2 | col_array | col_int | two strings_0 | two strings_1 |
I like fish | 1 | dog | housé | cat-car | a | 1 | [baby, sorry] | [1, 2, 3] | cat | car |
zombies | 2 | cat | tv | dog-tv | b | 2 | [baby 1, sorry 1] | [3, 4] | dog | tv |
simpsons cat lady | 2 | frog | table | eagle-tv-plus | 1 | 3 | [baby 2, sorry 2] | [5, 6, 7] | eagle | tv |
null | 3 | eagle | glass | lion-pc | c | 4 | [baby 3, sorry 3] | [7, 8] | lion | pc |
Only getting the first element
df.cols.unnest("two strings","-", index = 1).table()
words | num | animals | thing | two strings | filter | num 2 | col_array | col_int | two strings_1 |
I like fish | 1 | dog | housé | cat-car | a | 1 | [baby, sorry] | [1, 2, 3] | car |
zombies | 2 | cat | tv | dog-tv | b | 2 | [baby 1, sorry 1] | [3, 4] | tv |
simpsons cat lady | 2 | frog | table | eagle-tv-plus | 1 | 3 | [baby 2, sorry 2] | [5, 6, 7] | tv |
null | 3 | eagle | glass | lion-pc | c | 4 | [baby 3, sorry 3] | [7, 8] | pc |
Unnest array of string
df.cols.unnest(["col_array"]).table()
words | num | animals | thing | two strings | filter | num 2 | col_array | col_int | col_array_0 | col_array_1 |
I like fish | 1 | dog | housé | cat-car | a | 1 | [baby, sorry] | [1, 2, 3] | baby | sorry |
zombies | 2 | cat | tv | dog-tv | b | 2 | [baby 1, sorry 1] | [3, 4] | baby 1 | sorry 1 |
simpsons cat lady | 2 | frog | table | eagle-tv-plus | 1 | 3 | [baby 2, sorry 2] | [5, 6, 7] | baby 2 | sorry 2 |
null | 3 | eagle | glass | lion-pc | c | 4 | [baby 3, sorry 3] | [7, 8] | baby 3 | sorry 3 |
Split in 3 parts
df \
.cols.unnest(["two strings"], n= 3, mark = "-") \
.table()
words | num | animals | thing | two strings | filter | num 2 | col_array | col_int | two strings_0 | two strings_1 | two strings_2 |
I like fish | 1 | dog | housé | cat-car | a | 1 | [baby, sorry] | [1, 2, 3] | cat | car | null |
zombies | 2 | cat | tv | dog-tv | b | 2 | [baby 1, sorry 1] | [3, 4] | dog | tv | null |
simpsons cat lady | 2 | frog | table | eagle-tv-plus | 1 | 3 | [baby 2, sorry 2] | [5, 6, 7] | eagle | tv | plus |
null | 3 | eagle | glass | lion-pc | c | 4 | [baby 3, sorry 3] | [7, 8] | lion | pc | null |
cols.impute(input_cols, output_cols, strategy=”mean”)¶
Imputes missing data from specified columns using the mean or median.
# Create test dataset
df_fill = op.spark.createDataFrame([(1.0, float("nan")), (2.0, float("nan")),
(float("nan"), 3.0), (4.0, 4.0), (5.0, 5.0)], ["a", "b"])
df_fill.cols.impute(["a", "b"], ["out_a", "out_b"], "median").table()
a | b | out_a | out_b |
1.0 | NaN | 1.0 | 4.0 |
2.0 | NaN | 2.0 | 4.0 |
NaN | 3.0 | 2.0 | 3.0 |
4.0 | 4.0 | 4.0 | 4.0 |
5.0 | 5.0 | 5.0 | 5.0 |
cols.select_by_dtypes(data_type)¶
Returns one or multiple dataframe columns which match with the data type provided.
df.cols.select_by_dtypes("int").table()
num |
1 |
2 |
2 |
3 |
cols.apply_by_dtypes(columns, func, func_return_type, args=None, func_type=None, data_type=None)¶
Apply a function using pandas udf or udf if apache arrow is not available.
In the next example we replace a number in a string column with “new string”:
def func(val, attr):
return attr
df.cols.apply_by_dtypes("filter", func, "string", "new string", data_type="integer").table()
words | num | animals | thing | two strings | filter | num 2 | col_array | col_int |
I like fish | 1 | dog | housé | cat-car | a | 1 | [baby, sorry] | [1, 2, 3] |
zombies | 2 | cat | tv | dog-tv | b | 2 | [baby 1, sorry 1] | [3, 4] |
simpsons cat lady | 2 | frog | table | eagle-tv-plus | new string | 3 | [baby 2, sorry 2] | [5, 6, 7] |
null | 3 | eagle | glass | lion-pc | c | 4 | [baby 3, sorry 3] | [7, 8] |
User Define Functions in Optimus¶
Now we’ll create a UDF function that sum a values (32 in this case) to two columns
df = df.cols.append("new_col_1", 1)
def func(val, attr):
return val + attr
df.cols.apply(["num", "new_col_1"], func, "int", 32 ,"udf").table()
words | num | animals | thing | two strings | filter | num 2 | col_array | col_int | new_col_1 |
I like fish | 33 | dog | housé | cat-car | a | 1 | [baby, sorry] | [1, 2, 3] | 33 |
zombies | 34 | cat | tv | dog-tv | b | 2 | [baby 1, sorry 1] | [3, 4] | 33 |
simpsons cat lady | 34 | frog | table | eagle-tv-plus | 1 | 3 | [baby 2, sorry 2] | [5, 6, 7] | 33 |
null | 35 | eagle | glass | lion-pc | c | 4 | [baby 3, sorry 3] | [7, 8] | 33 |
Now a we’ll create a Pandas UDF function that sum a values (10 in this case) to two columns
def func(val, attr):
return val + attr
df.cols.apply(["num", "new_col_1"], func, "int", 10).table()
words | num | animals | thing | two strings | filter | num 2 | col_array | col_int | new_col_1 |
I like fish | 11 | dog | housé | cat-car | a | 1 | [baby, sorry] | [1, 2, 3] | 11 |
zombies | 12 | cat | tv | dog-tv | b | 2 | [baby 1, sorry 1] | [3, 4] | 11 |
simpsons cat lady | 12 | frog | table | eagle-tv-plus | 1 | 3 | [baby 2, sorry 2] | [5, 6, 7] | 11 |
null | 13 | eagle | glass | lion-pc | c | 4 | [baby 3, sorry 3] | [7, 8] | 11 |
Create an abstract udf to filter a rows where the value of column “num”> 1
from optimus.functions import abstract_udf as audf
def func(val, attr):
return val>1
df.rows.select(audf("num", func, "boolean")).table()
words | num | animals | thing | two strings | filter | num 2 | col_array | col_int | new_col_1 |
zombies | 2 | cat | tv | dog-tv | b | 2 | [baby 1, sorry 1] | [3, 4] | 1 |
simpsons cat lady | 2 | frog | table | eagle-tv-plus | 1 | 3 | [baby 2, sorry 2] | [5, 6, 7] | 1 |
null | 3 | eagle | glass | lion-pc | c | 4 | [baby 3, sorry 3] | [7, 8] | 1 |
Create an abstract udf (Pandas UDF) to pass two arguments to a function a apply a sum operation
from optimus.functions import abstract_udf as audf
def func(val, attr):
return val+attr[0]+ attr[1]
df.withColumn("num_sum", audf ("num", func, "int", [10,20])).table()
words | num | animals | thing | two strings | filter | num 2 | col_array | col_int | new_col_1 | num_sum |
I like fish | 1 | dog | housé | cat-car | a | 1 | [baby, sorry] | [1, 2, 3] | 1 | 31 |
zombies | 2 | cat | tv | dog-tv | b | 2 | [baby 1, sorry 1] | [3, 4] | 1 | 32 |
simpsons cat lady | 2 | frog | table | eagle-tv-plus | 1 | 3 | [baby 2, sorry 2] | [5, 6, 7] | 1 | 32 |
null | 3 | eagle | glass | lion-pc | c | 4 | [baby 3, sorry 3] | [7, 8] | 1 | 33 |
cols.apply_expr(columns, func=None, args=None, filter_col_by_dtypes=None, verbose=True)¶
Apply a expression to column.
Here we’ll apply a column expression to when the value of “num” or “num 2” is grater than 2:
from pyspark.sql import functions as F
def func(col_name, attr):
return F.when(F.col(col_name)>2 ,10).otherwise(1)
df.cols.apply_expr(["num","num 2"], func).table()
words | num | animals | thing | two strings | filter | num 2 | col_array | col_int | new_col_1 |
I like fish | 1 | dog | housé | cat-car | a | 1 | [baby, sorry] | [1, 2, 3] | 1 |
zombies | 1 | cat | tv | dog-tv | b | 1 | [baby 1, sorry 1] | [3, 4] | 1 |
simpsons cat lady | 1 | frog | table | eagle-tv-plus | 1 | 10 | [baby 2, sorry 2] | [5, 6, 7] | 1 |
null | 10 | eagle | glass | lion-pc | c | 10 | [baby 3, sorry 3] | [7, 8] | 1 |
Convert to uppercase:
from pyspark.sql import functions as F
def func(col_name, attr):
return F.upper(F.col(col_name))
df.cols.apply_expr(["two strings","animals"], func).table()
words | num | animals | thing | two strings | filter | num 2 | col_array | col_int | new_col_1 |
I like fish | 1 | DOG | housé | CAT-CAR | a | 1 | [baby, sorry] | [1, 2, 3] | 1 |
zombies | 2 | CAT | tv | DOG-TV | b | 2 | [baby 1, sorry 1] | [3, 4] | 1 |
simpsons cat lady | 2 | FROG | table | EAGLE-TV-PLUS | 1 | 3 | [baby 2, sorry 2] | [5, 6, 7] | 1 |
null | 3 | EAGLE | glass | LION-PC | c | 4 | [baby 3, sorry 3] | [7, 8] | 1 |
cols.count_na(columns)¶
Returns the NAN and Null count in a Column.
import numpy as np
df_null = op.spark.createDataFrame(
[(1, 1, None), (1, 2, float(5)), (1, 3, np.nan), (1, 4, None), (1, 5, float(10)), (1, 6, float('nan')), (1, 6, float('nan'))],
('session', "timestamp1", "id2"))
df_null.cols.count_na("*")
Out -> {'session': 0, 'timestamp1': 0, 'id2': 5}
cols.count_uniques(columns, estimate=True)¶
Returns how many unique items exist in a columns
df.cols.count_uniques("*")
And you’ll get:
{'words': {'approx_count_distinct': 3},
'num': {'approx_count_distinct': 3},
'animals': {'approx_count_distinct': 4},
'thing': {'approx_count_distinct': 4},
'two strings': {'approx_count_distinct': 4},
'filter': {'approx_count_distinct': 4},
'num 2': {'approx_count_distinct': 4},
'col_array': {'approx_count_distinct': 3},
'col_int': {'approx_count_distinct': 4},
'new_col_1': {'approx_count_distinct': 1}}
cols.replace(columns, search_and_replace=None, value=None, regex=None)¶
Replace a value or a list of values by a specified string
df.cols.replace("animals",["dog","cat"],"animals").table()
Replace “dog”,”cat” in column “animals” by the word “animals”:
words | num | animals | thing | two strings | filter | num 2 | col_array | col_int | new_col_1 |
I like fish | 1 | animals | housé | cat-car | a | 1 | [baby, sorry] | [1, 2, 3] | 1 |
zombies | 2 | animals | tv | dog-tv | b | 2 | [baby 1, sorry 1] | [3, 4] | 1 |
simpsons cat lady | 2 | frog | table | eagle-tv-plus | 1 | 3 | [baby 2, sorry 2] | [5, 6, 7] | 1 |
null | 3 | eagle | glass | lion-pc | c | 4 | [baby 3, sorry 3] | [7, 8] | 1 |
Replace “dog-tv”, “cat”, “eagle”, “fish” in columns “two strings”,”animals” by “animals”:
df.cols.replace(["two strings","animals"], ["dog-tv", "cat", "eagle", "fish"], "animals").table()
words | num | animals | thing | two strings | filter | num 2 | col_array | col_int | new_col_1 |
I like fish | 1 | dog | housé | cat-car | a | 1 | [baby, sorry] | [1, 2, 3] | 1 |
zombies | 2 | animals | tv | animals | b | 2 | [baby 1, sorry 1] | [3, 4] | 1 |
simpsons cat lady | 2 | frog | table | eagle-tv-plus | 1 | 3 | [baby 2, sorry 2] | [5, 6, 7] | 1 |
null | 3 | animals | glass | lion-pc | c | 4 | [baby 3, sorry 3] | [7, 8] | 1 |
cols.nest(input_cols, output_col, shape=None, separator=” “)¶
Concat multiple columns to one with the format specified
df.cols.nest(["num", "new_col_1"], output_col = "col_nested", shape ="vector").table()
Merge two columns in a column vector:
words | num | animals | thing | two strings | filter | num 2 | col_array | col_int | new_col_1 | col_nested |
I like fish | 1 | dog | housé | cat-car | a | 1 | [baby, sorry] | [1, 2, 3] | 1 | [1.0,1.0] |
zombies | 2 | cat | tv | dog-tv | b | 2 | [baby 1, sorry 1] | [3, 4] | 1 | [2.0,1.0] |
simpsons cat lady | 2 | frog | table | eagle-tv-plus | 1 | 3 | [baby 2, sorry 2] | [5, 6, 7] | 1 | [2.0,1.0] |
null | 3 | eagle | glass | lion-pc | c | 4 | [baby 3, sorry 3] | [7, 8] | 1 | [3.0,1.0] |
Merge two columns in a string columns:
df.cols.nest(["animals", "two strings"], output_col= "col_nested", shape = "string").table()
words | num | animals | thing | two strings | filter | num 2 | col_array | col_int | new_col_1 | col_nested |
I like fish | 1 | dog | housé | cat-car | a | 1 | [baby, sorry] | [1, 2, 3] | 1 | dog cat-car |
zombies | 2 | cat | tv | dog-tv | b | 2 | [baby 1, sorry 1] | [3, 4] | 1 | cat dog-tv |
simpsons cat lady | 2 | frog | table | eagle-tv-plus | 1 | 3 | [baby 2, sorry 2] | [5, 6, 7] | 1 | frog eagle-tv-plus |
null | 3 | eagle | glass | lion-pc | c | 4 | [baby 3, sorry 3] | [7, 8] | 1 | eagle lion-pc |