From ad97d007d4946511cf466f35c24d20ac9d766375 Mon Sep 17 00:00:00 2001 From: Daniel Rizk Date: Fri, 20 Sep 2024 15:45:47 -0400 Subject: [PATCH 1/7] add udf and flexibility docs, add agg --- NEWS.md | 6 + docs/examples/UserGuide/getting_started.jl | 4 +- docs/examples/UserGuide/udfs_ex.jl | 197 +++++++++++++++++++++ docs/mkdocs.yml | 1 + src/TBD_macros.jl | 2 +- src/TidierDB.jl | 4 +- src/parsing_athena.jl | 4 +- src/parsing_clickhouse.jl | 4 +- src/parsing_duckdb.jl | 14 +- src/parsing_gbq.jl | 4 +- src/parsing_mssql.jl | 4 +- src/parsing_mysql.jl | 4 +- src/parsing_oracle.jl | 4 +- src/parsing_postgres.jl | 4 +- src/parsing_snowflake.jl | 4 +- src/parsing_sqlite.jl | 4 +- src/structs.jl | 4 +- 17 files changed, 237 insertions(+), 31 deletions(-) create mode 100644 docs/examples/UserGuide/udfs_ex.jl diff --git a/NEWS.md b/NEWS.md index 419fd3d..ebcaa6c 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,5 +1,11 @@ # TidierDB.jl updates +## v0.3.4 - 2024 +- docs around using UDFs flexibility of TidierDB parsing +- adds `agg()` to allow any built in sql aggregate function to be used in `@mutate` +- `t()` as alias for `from_query()` for convenience +- Bugfix: fixes MsSQL joins + ## v0.3.3 - 2024-08-29 - Bugfix: `@mutate` allows type conversion as part of larger mutate expressions diff --git a/docs/examples/UserGuide/getting_started.jl b/docs/examples/UserGuide/getting_started.jl index 95c4526..fef2524 100644 --- a/docs/examples/UserGuide/getting_started.jl +++ b/docs/examples/UserGuide/getting_started.jl @@ -61,6 +61,6 @@ # end # ``` # --- -# Tip: Setting `t(table) = from_query(table)` will save some keystrokes. 
-# This means after saving the results of `db_table` you can start all chains/refer to the data with `t(table)` +# Tip: `t()` is an alias for `from_query` +# This means after saving the results of `db_table` use `t(table)` to refer to the table or prior query # --- \ No newline at end of file diff --git a/docs/examples/UserGuide/udfs_ex.jl b/docs/examples/UserGuide/udfs_ex.jl new file mode 100644 index 0000000..adbf9c2 --- /dev/null +++ b/docs/examples/UserGuide/udfs_ex.jl @@ -0,0 +1,197 @@ +# TidierDB is unique in its statement parsing flexibility. This means that any built-in SQL function or user-defined function (UDF) is readily available. +# To use any function built into a database in `@mutate` or in `@summarize`, simply write the function call correctly, but replace `'` with `"`. This also applies to any UDF. The example below will illustrate UDFs in the context of DuckDB. + + +# ``` +# # Set up the connection +# using TidierDB # reexports DuckDB +# db = DuckDB.DB() +# con = DuckDB.connect(db) # this will be important for UDFs +# mtcars_path = "https://gist.githubusercontent.com/seankross/a412dfbd88b3db70b74b/raw/5f23f993cd87c283ce766e7ac6b329ee7cc2e1d1/mtcars.csv" +# mtcars = db_table(con, mtcars_path); +# ``` +# ## aggregate function in `@summarize` +# Let's use the DuckDB `kurtosis` aggregate function +# ``` +# @chain t(mtcars) begin +# @group_by cyl +# @summarize(kurt = kurtosis(mpg)) +# @collect +# end +# 3×2 DataFrame +# Row │ cyl kurt +# │ Int64? Float64? +# ─────┼─────────────────── +# 1 │ 4 -1.43411 +# 2 │ 6 -1.82944 +# 3 │ 8 0.330061 +# ``` + +# ## aggregate functions in `@mutate` + +# ``` +# @chain t(mtcars) begin +# @group_by(cyl) +# @mutate(kurt = agg("kurtosis(mpg)")) +# @select cyl mpg kurt +# @collect +# end + +# 32×3 DataFrame +# Row │ cyl mpg kurt +# │ Int64? Float64? Float64? 
+# ─────┼───────────────────────────── +# 1 │ 8 18.7 0.330061 +# 2 │ 8 14.3 0.330061 +# 3 │ 8 16.4 0.330061 +# 4 │ 8 17.3 0.330061 +# 5 │ 8 15.2 0.330061 +# 6 │ 8 10.4 0.330061 +# 7 │ 8 10.4 0.330061 +# ⋮ │ ⋮ ⋮ ⋮ +# 27 │ 6 21.0 -1.82944 +# 28 │ 6 21.4 -1.82944 +# 29 │ 6 18.1 -1.82944 +# 30 │ 6 19.2 -1.82944 +# 31 │ 6 17.8 -1.82944 +# 32 │ 6 19.7 -1.82944 +# 19 rows omitted +# end + +# ``` + + +# ## DuckDB function chaining +# In DuckDB, functions can be chained together with `.`. TidierDB lets you leverage this. +# ``` +# @chain t(mtcars) begin +# @mutate(model2 = model.upper().string_split(" ").list_aggr("string_agg",".").concat(".")) +# @select model model2 +# @collect +# end +# 32×2 DataFrame +# Row │ model model2 +# │ String? String? +# ─────┼─────────────────────────────────────── +# 1 │ Mazda RX4 MAZDA.RX4. +# 2 │ Mazda RX4 Wag MAZDA.RX4.WAG. +# 3 │ Datsun 710 DATSUN.710. +# 4 │ Hornet 4 Drive HORNET.4.DRIVE. +# 5 │ Hornet Sportabout HORNET.SPORTABOUT. +# 6 │ Valiant VALIANT. +# 7 │ Duster 360 DUSTER.360. +# ⋮ │ ⋮ ⋮ +# 27 │ Porsche 914-2 PORSCHE.914-2. +# 28 │ Lotus Europa LOTUS.EUROPA. +# 29 │ Ford Pantera L FORD.PANTERA.L. +# 30 │ Ferrari Dino FERRARI.DINO. +# 31 │ Maserati Bora MASERATI.BORA. +# 32 │ Volvo 142E VOLVO.142E. +# 19 rows omitted +# ``` + +# ## `rowid` and pseudocolumns +# When a table is not being read directly from a file, `rowid` is available for use. In general, TidierDB should support all pseudocolumns. +# ``` +# copy_to(db, mtcars_path, "mtcars"); # copying table in for demonstration purposes +# @chain db_table(con, :mtcars) begin +# @filter(rowid == 4) +# @select(model:hp) +# @collect +# end +# 1×5 DataFrame +# Row │ model mpg cyl disp hp +# │ String? Float64? Int64? Float64? Int64? +# ─────┼─────────────────────────────────────────────────────── +# 1 │ Hornet Sportabout 18.7 8 360.0 175 +# ``` + +# ## UDFs in `@mutate` +# The UDF drops in place with no further adjustments. 
Continue below to learn how to create a UDF in DuckDB.jl +# ``` +# @chain t(mtcars) begin +# @mutate(test = diff_of_squares(cyl, hp)) +# @select(test, cyl, hp) +# @collect +# end +# 32×3 DataFrame +# Row │ test cyl hp +# │ Int64 Int64 Int64 +# ─────┼─────────────────────── +# 1 │ -12064 6 110 +# 2 │ -12064 6 110 +# 3 │ -8633 4 93 +# 4 │ -12064 6 110 +# 5 │ -30561 8 175 +# 6 │ -10989 6 105 +# 7 │ -59961 8 245 +# ⋮ │ ⋮ ⋮ ⋮ +# 27 │ -8265 4 91 +# 28 │ -12753 4 113 +# 29 │ -69632 8 264 +# 30 │ -30589 6 175 +# 31 │ -112161 8 335 +# 32 │ -11865 4 109 +# 19 rows omitted +# ``` + + +# ## How to create a UDF in DuckDB.jl +# Once a UDF is registered in your DuckDB db, you can use it as you would any other SQL function, with no decorators. +# This next section will walk through defining a function, registering it, and finally using it with TidierDB. +# Of note, if other + +# ## Defining a UDF +# First, let's define a function that calculates the difference of squares. +# Input and Output Types: +# - `input::DuckDB.duckdb_data_chunk` is the incoming data chunk (a batch of rows) that DuckDB passes to the function. +# - `output::DuckDB.duckdb_vector` is where the result of the function is written. +# ``` +# function DiffOfSquares(info::DuckDB.duckdb_function_info, input::DuckDB.duckdb_data_chunk, output::DuckDB.duckdb_vector) +# # We first convert the raw input to a DataChunk object using `DuckDB.DataChunk(input, false)` +# input = DuckDB.DataChunk(input, false) +# # Determine how many rows (n) are in the chunk using `DuckDB.get_size(input)`. +# n = DuckDB.get_size(input) +# # We retrieve the first and second input columns with DuckDB.get_vector() +# # And convert them into Julia arrays with DuckDB.get_array. 
+# a_data = DuckDB.get_array(DuckDB.get_vector(input, 1), Int64, n) +# b_data = DuckDB.get_array(DuckDB.get_vector(input, 2), Int64, n) +# # create an output array output_data corresponding to the output column +# output_data = DuckDB.get_array(DuckDB.Vec(output), Int64, n) +# # loop through each row, perform the desired operation and store the result in output_data[row]. +# for row in 1:n +# output_data[row] = a_data[row]^2 - b_data[row]^2 +# end +# end; +# ``` + +# ## Configure the UDF +# Once the function is defined, the next step is to register it in your DuckDB db. This involves creating a scalar function object, specifying the input/output types, linking the function, and registering it with the database. +# ``` +# # Create scalar function object +# f = DuckDB.duckdb_create_scalar_function() +# DuckDB.duckdb_scalar_function_set_name(f, "diff_of_squares") +# ``` + +# Input parameters are defined with `duckdb_create_logical_type(type)` where type is, for example, `DUCKDB_TYPE_BIGINT` for integers or `DUCKDB_TYPE_VARCHAR` for strings. +# ``` +# # Define input parameters as BIGINT +# type = DuckDB.duckdb_create_logical_type(DuckDB.DUCKDB_TYPE_BIGINT) +# DuckDB.duckdb_table_function_add_parameter(f, type) +# DuckDB.duckdb_table_function_add_parameter(f, type) + +# # Define return type as BIGINT +# DuckDB.duckdb_scalar_function_set_return_type(f, type) +# DuckDB.duckdb_destroy_logical_type(type) +# ``` + +# ## Link and Register the Julia Function +# `@cfunction` is used to convert the Julia function into a callable C function, which DuckDB can invoke. 
+# ``` +# CDiffOfSquares = @cfunction(DiffOfSquares, Cvoid, (DuckDB.duckdb_function_info, DuckDB.duckdb_data_chunk, DuckDB.duckdb_vector)) + +# # Set the function handler and register +# DuckDB.duckdb_scalar_function_set_function(f, CDiffOfSquares) +# DuckDB.duckdb_register_scalar_function(con.handle, f) +# ``` + diff --git a/docs/mkdocs.yml b/docs/mkdocs.yml index 75407c2..d6b2b6a 100644 --- a/docs/mkdocs.yml +++ b/docs/mkdocs.yml @@ -126,4 +126,5 @@ nav: - "Writing Functions/Macros with TidierDB Chains" : "examples/generated/UserGuide/functions_pass_to_DB.md" - "Working With Larger than RAM Datasets" : "examples/generated/UserGuide/outofmemex.md" - "TidierDB.jl vs Ibis" : "examples/generated/UserGuide/ibis_comp.md" + - "Flexible Syntax and UDFs" : "examples/generated/UserGuide/udfs_ex.md" - "Reference" : "reference.md" \ No newline at end of file diff --git a/src/TBD_macros.jl b/src/TBD_macros.jl index 627dba8..8609f31 100644 --- a/src/TBD_macros.jl +++ b/src/TBD_macros.jl @@ -775,6 +775,6 @@ end """ $docstring_show_tables """ -function show_tables(con::DuckDB.DB) +function show_tables(con::Union{DuckDB.DB, DuckDB.Connection}) return DataFrame(DBInterface.execute(con, "SHOW TABLES")) end diff --git a/src/TidierDB.jl b/src/TidierDB.jl index 611196b..1f12798 100644 --- a/src/TidierDB.jl +++ b/src/TidierDB.jl @@ -18,7 +18,7 @@ using GZip @distinct, @left_join, @right_join, @inner_join, @count, @window_order, @window_frame, @show_query, @collect, @slice_max, @slice_min, @slice_sample, @rename, copy_to, duckdb_open, duckdb_connect, @semi_join, @full_join, @anti_join, connect, from_query, @interpolate, add_interp_parameter!, update_con, @head, - clickhouse, duckdb, sqlite, mysql, mssql, postgres, athena, snowflake, gbq, oracle, databricks, SQLQuery, show_tables + clickhouse, duckdb, sqlite, mysql, mssql, postgres, athena, snowflake, gbq, oracle, databricks, SQLQuery, show_tables, t abstract type SQLBackend end @@ -172,7 +172,7 @@ end # DuckDB -function 
get_table_metadata(conn::DuckDB.DB, table_name::String) +function get_table_metadata(conn::Union{DuckDB.DB, DuckDB.Connection}, table_name::String) set_sql_mode(duckdb()); query = """ diff --git a/src/parsing_athena.jl b/src/parsing_athena.jl index ecb81de..e4b98fd 100644 --- a/src/parsing_athena.jl +++ b/src/parsing_athena.jl @@ -82,9 +82,9 @@ function expr_to_sql_trino(expr, sq; from_summarize::Bool) window_clause = construct_window_clause(sq) return "VAR_SAMP($(string(a))) $(window_clause)" end - #elseif @capture(x, sql_agg(str_)) + #elseif @capture(x, agg(str_)) # if from_summarize - # return error("sql_agg is only needed with aggregate functions in @mutate") + # return error("agg is only needed with aggregate functions in @mutate") # else # window_clause = construct_window_clause(sq) # return "$(str) $(window_clause)" diff --git a/src/parsing_clickhouse.jl b/src/parsing_clickhouse.jl index 6739e6b..c7db84e 100644 --- a/src/parsing_clickhouse.jl +++ b/src/parsing_clickhouse.jl @@ -82,9 +82,9 @@ function expr_to_sql_clickhouse(expr, sq; from_summarize::Bool) window_clause = construct_window_clause(sq) return "VAR_SAMP($(string(a))) $(window_clause)" end - #elseif @capture(x, sql_agg(str_)) + #elseif @capture(x, agg(str_)) # if from_summarize - # return error("sql_agg is only needed with aggregate functions in @mutate") + # return error("agg is only needed with aggregate functions in @mutate") # else # window_clause = construct_window_clause(sq) # return "$(str) $(window_clause)" diff --git a/src/parsing_duckdb.jl b/src/parsing_duckdb.jl index a8ce019..c85d872 100644 --- a/src/parsing_duckdb.jl +++ b/src/parsing_duckdb.jl @@ -81,13 +81,13 @@ function expr_to_sql_duckdb(expr, sq; from_summarize::Bool) window_clause = construct_window_clause(sq) return "VAR_SAMP($(string(a))) $(window_clause)" end - #elseif @capture(x, sql_agg(str_)) - # if from_summarize - # return error("sql_agg is only needed with aggregate functions in @mutate") - # else - # window_clause = 
construct_window_clause(sq) - # return "$(str) $(window_clause)" - # end + elseif @capture(x, agg(str_)) + if from_summarize + return error("agg is only needed with aggregate functions in @mutate") + else + window_clause = construct_window_clause(sq) + return "$(str) $(window_clause)" + end #stringr functions, have to use function that removes _ so capture can capture name elseif @capture(x, strreplaceall(str_, pattern_, replace_)) return :(REGEXP_REPLACE($str, $pattern, $replace, 'g')) diff --git a/src/parsing_gbq.jl b/src/parsing_gbq.jl index fdeec20..f764342 100644 --- a/src/parsing_gbq.jl +++ b/src/parsing_gbq.jl @@ -111,9 +111,9 @@ function expr_to_sql_gbq(expr, sq; from_summarize::Bool) window_clause = construct_window_clause(sq) return "VAR_SAMP($(string(a))) $(window_clause)" end - #elseif @capture(x, sql_agg(str_)) + #elseif @capture(x, agg(str_)) # if from_summarize - # return error("sql_agg is only needed with aggregate functions in @mutate") + # return error("agg is only needed with aggregate functions in @mutate") # else # window_clause = construct_window_clause(sq) # return "$(str) $(window_clause)" diff --git a/src/parsing_mssql.jl b/src/parsing_mssql.jl index 4f07dd7..ac4720f 100644 --- a/src/parsing_mssql.jl +++ b/src/parsing_mssql.jl @@ -82,9 +82,9 @@ function expr_to_sql_mssql(expr, sq; from_summarize::Bool) window_clause = construct_window_clause(sq) return "VAR_SAMP($(string(a))) $(window_clause)" end - #elseif @capture(x, sql_agg(str_)) + #elseif @capture(x, agg(str_)) # if from_summarize - # return error("sql_agg is only needed with aggregate functions in @mutate") + # return error("agg is only needed with aggregate functions in @mutate") # else # window_clause = construct_window_clause(sq) # return "$(str) $(window_clause)" diff --git a/src/parsing_mysql.jl b/src/parsing_mysql.jl index 8a150fe..80290c0 100644 --- a/src/parsing_mysql.jl +++ b/src/parsing_mysql.jl @@ -82,9 +82,9 @@ function expr_to_sql_mysql(expr, sq; from_summarize::Bool) 
window_clause = construct_window_clause(sq) return "VAR_SAMP($(string(a))) $(window_clause)" end - #elseif @capture(x, sql_agg(str_)) + #elseif @capture(x, agg(str_)) # if from_summarize - # return error("sql_agg is only needed with aggregate functions in @mutate") + # return error("agg is only needed with aggregate functions in @mutate") # else # window_clause = construct_window_clause(sq) # return "$(str) $(window_clause)" diff --git a/src/parsing_oracle.jl b/src/parsing_oracle.jl index b6c94f2..13f9907 100644 --- a/src/parsing_oracle.jl +++ b/src/parsing_oracle.jl @@ -82,9 +82,9 @@ function expr_to_sql_oracle(expr, sq; from_summarize::Bool) window_clause = construct_window_clause(sq) return "VAR_SAMP($(string(a))) $(window_clause)" end - #elseif @capture(x, sql_agg(str_)) + #elseif @capture(x, agg(str_)) # if from_summarize - # return error("sql_agg is only needed with aggregate functions in @mutate") + # return error("agg is only needed with aggregate functions in @mutate") # else # window_clause = construct_window_clause(sq) # return "$(str) $(window_clause)" diff --git a/src/parsing_postgres.jl b/src/parsing_postgres.jl index 7aeb318..20f4610 100644 --- a/src/parsing_postgres.jl +++ b/src/parsing_postgres.jl @@ -82,9 +82,9 @@ function expr_to_sql_postgres(expr, sq; from_summarize::Bool) window_clause = construct_window_clause(sq) return "VAR_SAMP($(string(a))) $(window_clause)" end - #elseif @capture(x, sql_agg(str_)) + #elseif @capture(x, agg(str_)) # if from_summarize - # return error("sql_agg is only needed with aggregate functions in @mutate") + # return error("agg is only needed with aggregate functions in @mutate") # else # window_clause = construct_window_clause(sq) # return "$(str) $(window_clause)" diff --git a/src/parsing_snowflake.jl b/src/parsing_snowflake.jl index 30a10b6..b68c70f 100644 --- a/src/parsing_snowflake.jl +++ b/src/parsing_snowflake.jl @@ -83,9 +83,9 @@ function expr_to_sql_snowflake(expr, sq; from_summarize::Bool) window_clause = 
construct_window_clause(sq) return "VAR_SAMP($(string(a))) $(window_clause)" end - #elseif @capture(x, sql_agg(str_)) + #elseif @capture(x, agg(str_)) # if from_summarize - # return error("sql_agg is only needed with aggregate functions in @mutate") + # return error("agg is only needed with aggregate functions in @mutate") # else # window_clause = construct_window_clause(sq) # return "$(str) $(window_clause)" diff --git a/src/parsing_sqlite.jl b/src/parsing_sqlite.jl index c6a2af1..f08cb9e 100644 --- a/src/parsing_sqlite.jl +++ b/src/parsing_sqlite.jl @@ -55,9 +55,9 @@ function expr_to_sql_lite(expr, sq; from_summarize::Bool) window_clause = construct_window_clause(sq, from_cumsum = true) return "SUM($(string(a))) $(window_clause)" end - elseif @capture(x, sql_agg(str_)) + elseif @capture(x, agg(str_)) if from_summarize - return error("sql_agg is only needed with aggregate functions in @mutate") + return error("agg is only needed with aggregate functions in @mutate") else window_clause = construct_window_clause(sq) return "$(str) $(window_clause)" diff --git a/src/structs.jl b/src/structs.jl index 77e786d..9ba4df3 100644 --- a/src/structs.jl +++ b/src/structs.jl @@ -101,4 +101,6 @@ function from_query(query::TidierDB.SQLQuery) ch_settings = query.ch_settings ) return new_query -end \ No newline at end of file +end + +t(table) = from_query(table) \ No newline at end of file From 2063fda8f91e56ff2e671252e2f557c45677dc5c Mon Sep 17 00:00:00 2001 From: Daniel Rizk Date: Sun, 22 Sep 2024 10:40:57 -0400 Subject: [PATCH 2/7] fixes window functions, tested against polars --- src/parsing_athena.jl | 23 ++++++++++++++++------- src/parsing_clickhouse.jl | 23 ++++++++++++++++------- src/parsing_duckdb.jl | 9 +++++++++ src/parsing_gbq.jl | 23 ++++++++++++++++------- src/parsing_mssql.jl | 23 ++++++++++++++++------- src/parsing_mysql.jl | 23 ++++++++++++++++------- src/parsing_oracle.jl | 23 ++++++++++++++++------- src/parsing_postgres.jl | 23 ++++++++++++++++------- 
src/parsing_snowflake.jl | 23 ++++++++++++++++------- 9 files changed, 137 insertions(+), 56 deletions(-) diff --git a/src/parsing_athena.jl b/src/parsing_athena.jl index e4b98fd..7ce7a14 100644 --- a/src/parsing_athena.jl +++ b/src/parsing_athena.jl @@ -82,13 +82,22 @@ function expr_to_sql_trino(expr, sq; from_summarize::Bool) window_clause = construct_window_clause(sq) return "VAR_SAMP($(string(a))) $(window_clause)" end - #elseif @capture(x, agg(str_)) - # if from_summarize - # return error("agg is only needed with aggregate functions in @mutate") - # else - # window_clause = construct_window_clause(sq) - # return "$(str) $(window_clause)" - # end + elseif @capture(x, agg(str_)) + if from_summarize + return error("agg is only needed with aggregate functions in @mutate") + else + window_clause = construct_window_clause(sq) + return "$(str) $(window_clause)" + end + elseif !isempty(sq.window_order) && isa(x, Expr) && x.head == :call + function_name = x.args[1] # This will be `lead` + args = x.args[2:end] # Capture all arguments from the second position onward + window_clause = construct_window_clause(sq) + + # Create the SQL string representation of the function call + arg_str = join(map(string, args), ", ") # Join arguments into a string + str = "$(function_name)($(arg_str))" # Construct the function call string + return "$(str) $(window_clause)" #stringr functions, have to use function that removes _ so capture can capture name elseif @capture(x, strreplaceall(str_, pattern_, replace_)) return :(REGEXP_REPLACE($str, $pattern, $replace, 'g')) diff --git a/src/parsing_clickhouse.jl b/src/parsing_clickhouse.jl index c7db84e..3026097 100644 --- a/src/parsing_clickhouse.jl +++ b/src/parsing_clickhouse.jl @@ -82,13 +82,22 @@ function expr_to_sql_clickhouse(expr, sq; from_summarize::Bool) window_clause = construct_window_clause(sq) return "VAR_SAMP($(string(a))) $(window_clause)" end - #elseif @capture(x, agg(str_)) - # if from_summarize - # return error("agg is only 
needed with aggregate functions in @mutate") - # else - # window_clause = construct_window_clause(sq) - # return "$(str) $(window_clause)" - # end + elseif @capture(x, agg(str_)) + if from_summarize + return error("agg is only needed with aggregate functions in @mutate") + else + window_clause = construct_window_clause(sq) + return "$(str) $(window_clause)" + end + elseif !isempty(sq.window_order) && isa(x, Expr) && x.head == :call + function_name = x.args[1] # This will be `lead` + args = x.args[2:end] # Capture all arguments from the second position onward + window_clause = construct_window_clause(sq) + + # Create the SQL string representation of the function call + arg_str = join(map(string, args), ", ") # Join arguments into a string + str = "$(function_name)($(arg_str))" # Construct the function call string + return "$(str) $(window_clause)" #stringr functions, have to use function that removes _ so capture can capture name elseif @capture(x, strreplaceall(str_, pattern_, replace_)) return :(replaceRegexpAll($str, $pattern, $replace)) diff --git a/src/parsing_duckdb.jl b/src/parsing_duckdb.jl index c85d872..4cb9973 100644 --- a/src/parsing_duckdb.jl +++ b/src/parsing_duckdb.jl @@ -88,6 +88,15 @@ function expr_to_sql_duckdb(expr, sq; from_summarize::Bool) window_clause = construct_window_clause(sq) return "$(str) $(window_clause)" end + elseif !isempty(sq.window_order) && isa(x, Expr) && x.head == :call + function_name = x.args[1] # This will be `lead` + args = x.args[2:end] # Capture all arguments from the second position onward + window_clause = construct_window_clause(sq) + + # Create the SQL string representation of the function call + arg_str = join(map(string, args), ", ") # Join arguments into a string + str = "$(function_name)($(arg_str))" # Construct the function call string + return "$(str) $(window_clause)" #stringr functions, have to use function that removes _ so capture can capture name elseif @capture(x, strreplaceall(str_, pattern_, replace_)) 
return :(REGEXP_REPLACE($str, $pattern, $replace, 'g')) diff --git a/src/parsing_gbq.jl b/src/parsing_gbq.jl index f764342..d10a1de 100644 --- a/src/parsing_gbq.jl +++ b/src/parsing_gbq.jl @@ -111,13 +111,22 @@ function expr_to_sql_gbq(expr, sq; from_summarize::Bool) window_clause = construct_window_clause(sq) return "VAR_SAMP($(string(a))) $(window_clause)" end - #elseif @capture(x, agg(str_)) - # if from_summarize - # return error("agg is only needed with aggregate functions in @mutate") - # else - # window_clause = construct_window_clause(sq) - # return "$(str) $(window_clause)" - # end + elseif @capture(x, agg(str_)) + if from_summarize + return error("agg is only needed with aggregate functions in @mutate") + else + window_clause = construct_window_clause(sq) + return "$(str) $(window_clause)" + end + elseif !isempty(sq.window_order) && isa(x, Expr) && x.head == :call + function_name = x.args[1] # This will be `lead` + args = x.args[2:end] # Capture all arguments from the second position onward + window_clause = construct_window_clause(sq) + + # Create the SQL string representation of the function call + arg_str = join(map(string, args), ", ") # Join arguments into a string + str = "$(function_name)($(arg_str))" # Construct the function call string + return "$(str) $(window_clause)" #stringr functions, have to use function that removes _ so capture can capture name elseif @capture(x, strreplaceall(str_, pattern_, replace_)) return :(REGEXP_REPLACE($str, $pattern, $replace, 'g')) diff --git a/src/parsing_mssql.jl b/src/parsing_mssql.jl index ac4720f..8b3d76b 100644 --- a/src/parsing_mssql.jl +++ b/src/parsing_mssql.jl @@ -82,13 +82,22 @@ function expr_to_sql_mssql(expr, sq; from_summarize::Bool) window_clause = construct_window_clause(sq) return "VAR_SAMP($(string(a))) $(window_clause)" end - #elseif @capture(x, agg(str_)) - # if from_summarize - # return error("agg is only needed with aggregate functions in @mutate") - # else - # window_clause = 
construct_window_clause(sq) - # return "$(str) $(window_clause)" - # end + elseif @capture(x, agg(str_)) + if from_summarize + return error("agg is only needed with aggregate functions in @mutate") + else + window_clause = construct_window_clause(sq) + return "$(str) $(window_clause)" + end + elseif !isempty(sq.window_order) && isa(x, Expr) && x.head == :call + function_name = x.args[1] # This will be `lead` + args = x.args[2:end] # Capture all arguments from the second position onward + window_clause = construct_window_clause(sq) + + # Create the SQL string representation of the function call + arg_str = join(map(string, args), ", ") # Join arguments into a string + str = "$(function_name)($(arg_str))" # Construct the function call string + return "$(str) $(window_clause)" #stringr functions, have to use function that removes _ so capture can capture name elseif @capture(x, strreplaceall(str_, pattern_, replace_)) return :(REPLACE($str, $pattern, $replace)) diff --git a/src/parsing_mysql.jl b/src/parsing_mysql.jl index 80290c0..7550d48 100644 --- a/src/parsing_mysql.jl +++ b/src/parsing_mysql.jl @@ -82,13 +82,22 @@ function expr_to_sql_mysql(expr, sq; from_summarize::Bool) window_clause = construct_window_clause(sq) return "VAR_SAMP($(string(a))) $(window_clause)" end - #elseif @capture(x, agg(str_)) - # if from_summarize - # return error("agg is only needed with aggregate functions in @mutate") - # else - # window_clause = construct_window_clause(sq) - # return "$(str) $(window_clause)" - # end + elseif @capture(x, agg(str_)) + if from_summarize + return error("agg is only needed with aggregate functions in @mutate") + else + window_clause = construct_window_clause(sq) + return "$(str) $(window_clause)" + end + elseif !isempty(sq.window_order) && isa(x, Expr) && x.head == :call + function_name = x.args[1] # This will be `lead` + args = x.args[2:end] # Capture all arguments from the second position onward + window_clause = construct_window_clause(sq) + + # Create 
the SQL string representation of the function call + arg_str = join(map(string, args), ", ") # Join arguments into a string + str = "$(function_name)($(arg_str))" # Construct the function call string + return "$(str) $(window_clause)" #stringr functions, have to use function that removes _ so capture can capture name elseif @capture(x, strreplaceall(str_, pattern_, replace_)) return :(REGEXP_REPLACE($str, $pattern, $replace)) diff --git a/src/parsing_oracle.jl b/src/parsing_oracle.jl index 13f9907..b0c0faf 100644 --- a/src/parsing_oracle.jl +++ b/src/parsing_oracle.jl @@ -82,13 +82,22 @@ function expr_to_sql_oracle(expr, sq; from_summarize::Bool) window_clause = construct_window_clause(sq) return "VAR_SAMP($(string(a))) $(window_clause)" end - #elseif @capture(x, agg(str_)) - # if from_summarize - # return error("agg is only needed with aggregate functions in @mutate") - # else - # window_clause = construct_window_clause(sq) - # return "$(str) $(window_clause)" - # end + elseif @capture(x, agg(str_)) + if from_summarize + return error("agg is only needed with aggregate functions in @mutate") + else + window_clause = construct_window_clause(sq) + return "$(str) $(window_clause)" + end + elseif !isempty(sq.window_order) && isa(x, Expr) && x.head == :call + function_name = x.args[1] # This will be `lead` + args = x.args[2:end] # Capture all arguments from the second position onward + window_clause = construct_window_clause(sq) + + # Create the SQL string representation of the function call + arg_str = join(map(string, args), ", ") # Join arguments into a string + str = "$(function_name)($(arg_str))" # Construct the function call string + return "$(str) $(window_clause)" #stringr functions, have to use function that removes _ so capture can capture name elseif @capture(x, strreplaceall(str_, pattern_, replace_)) return :(REPLACE($str, $pattern, $replace)) diff --git a/src/parsing_postgres.jl b/src/parsing_postgres.jl index 20f4610..e1a3c09 100644 --- 
a/src/parsing_postgres.jl +++ b/src/parsing_postgres.jl @@ -82,13 +82,22 @@ function expr_to_sql_postgres(expr, sq; from_summarize::Bool) window_clause = construct_window_clause(sq) return "VAR_SAMP($(string(a))) $(window_clause)" end - #elseif @capture(x, agg(str_)) - # if from_summarize - # return error("agg is only needed with aggregate functions in @mutate") - # else - # window_clause = construct_window_clause(sq) - # return "$(str) $(window_clause)" - # end + elseif @capture(x, agg(str_)) + if from_summarize + return error("agg is only needed with aggregate functions in @mutate") + else + window_clause = construct_window_clause(sq) + return "$(str) $(window_clause)" + end + elseif !isempty(sq.window_order) && isa(x, Expr) && x.head == :call + function_name = x.args[1] # This will be `lead` + args = x.args[2:end] # Capture all arguments from the second position onward + window_clause = construct_window_clause(sq) + + # Create the SQL string representation of the function call + arg_str = join(map(string, args), ", ") # Join arguments into a string + str = "$(function_name)($(arg_str))" # Construct the function call string + return "$(str) $(window_clause)" #stringr functions, have to use function that removes _ so capture can capture name elseif @capture(x, strreplaceall(str_, pattern_, replace_)) return :(REGEXP_REPLACE($str, $pattern, $replace, 'g')) diff --git a/src/parsing_snowflake.jl b/src/parsing_snowflake.jl index b68c70f..db922c0 100644 --- a/src/parsing_snowflake.jl +++ b/src/parsing_snowflake.jl @@ -83,13 +83,22 @@ function expr_to_sql_snowflake(expr, sq; from_summarize::Bool) window_clause = construct_window_clause(sq) return "VAR_SAMP($(string(a))) $(window_clause)" end - #elseif @capture(x, agg(str_)) - # if from_summarize - # return error("agg is only needed with aggregate functions in @mutate") - # else - # window_clause = construct_window_clause(sq) - # return "$(str) $(window_clause)" - # end + elseif @capture(x, agg(str_)) + if from_summarize 
+ return error("agg is only needed with aggregate functions in @mutate") + else + window_clause = construct_window_clause(sq) + return "$(str) $(window_clause)" + end + elseif !isempty(sq.window_order) && isa(x, Expr) && x.head == :call + function_name = x.args[1] # This will be `lead` + args = x.args[2:end] # Capture all arguments from the second position onward + window_clause = construct_window_clause(sq) + + # Create the SQL string representation of the function call + arg_str = join(map(string, args), ", ") # Join arguments into a string + str = "$(function_name)($(arg_str))" # Construct the function call string + return "$(str) $(window_clause)" #stringr functions, have to use function that removes _ so capture can capture name elseif @capture(x, strreplaceall(str_, pattern_, replace_)) return :(REGEXP_REPLACE($str, $pattern, $replace, 'g')) From 79aafcc576ddb5a87b3ae51a122f65b864fd2455 Mon Sep 17 00:00:00 2001 From: Daniel Rizk Date: Sun, 22 Sep 2024 14:51:29 -0400 Subject: [PATCH 3/7] `agg` no longer requires a string and works with `across` in `@mutate` --- docs/examples/UserGuide/udfs_ex.jl | 4 +- src/db_parsing.jl | 60 +++++++++++++++++++++++++----- src/parsing_athena.jl | 8 +++- src/parsing_clickhouse.jl | 8 +++- src/parsing_duckdb.jl | 8 +++- src/parsing_gbq.jl | 8 +++- src/parsing_mssql.jl | 8 +++- src/parsing_mysql.jl | 8 +++- src/parsing_oracle.jl | 8 +++- src/parsing_postgres.jl | 8 +++- src/parsing_snowflake.jl | 8 +++- src/parsing_sqlite.jl | 8 +++- 12 files changed, 112 insertions(+), 32 deletions(-) diff --git a/docs/examples/UserGuide/udfs_ex.jl b/docs/examples/UserGuide/udfs_ex.jl index adbf9c2..729f6cf 100644 --- a/docs/examples/UserGuide/udfs_ex.jl +++ b/docs/examples/UserGuide/udfs_ex.jl @@ -28,11 +28,11 @@ # ``` # ## aggregate functions in `@mutate` - +# To aggregate sql functions that are builtin to any database, but exist outside of the TidierDB parser, simply wrap the function call in `agg()` # ``` # @chain t(mtcars) begin # 
@group_by(cyl) -# @mutate(kurt = agg("kurtosis(mpg)")) +# @mutate(kurt = agg(kurtosis(mpg))) # @select cyl mpg kurt # @collect # end diff --git a/src/db_parsing.jl b/src/db_parsing.jl index 1e913ab..b10ea95 100644 --- a/src/db_parsing.jl +++ b/src/db_parsing.jl @@ -244,14 +244,13 @@ end function parse_across(expr, metadata) columns_expr, funcs_expr = expr.args[2], expr.args[3] + + # Existing column selection logic remains unchanged if isa(columns_expr, String) - # Split the string on commas and trim any whitespace around the names - columns_exprs = map(Symbol, split(strip(columns_expr), ", ")) + columns_exprs = map(Symbol, split(strip(columns_expr), ",")) elseif isa(columns_expr, Expr) && columns_expr.head == :tuple - # If columns_expr is a tuple expression, extract its arguments columns_exprs = columns_expr.args else - # Handle single columns or other expressions by wrapping in an array columns_exprs = [columns_expr] end @@ -261,13 +260,12 @@ function parse_across(expr, metadata) for func in funcs for col_name in resolved_columns - func_name = isa(func, Symbol) ? 
func : func.args[1] - result_name = Symbol(string(func_name), "_", col_name) - - # Ensure column names are treated as symbols (identifiers) col_symbol = Meta.parse(col_name) # Convert string back to symbol - - new_expr = :($result_name = $func_name($col_symbol)) + func_filled = insert_col_into_func(func, col_symbol) + # Specify "agg" to be skipped in the result name + func_name_str = generate_func_name(func, ["agg"]) + result_name = Symbol(func_name_str, "_", col_name) + new_expr = Expr(:(=), result_name, func_filled) push!(result_exprs, new_expr) end end @@ -276,6 +274,48 @@ function parse_across(expr, metadata) return combined_expr end +function insert_col_into_func(func_expr, col_symbol) + if isa(func_expr, Symbol) + # Simple function name; create a call with the column symbol + return Expr(:call, func_expr, col_symbol) + elseif isa(func_expr, Expr) && func_expr.head == :call + # Function call; recursively insert the column symbol into arguments + func_name = func_expr.args[1] + args = func_expr.args[2:end] + new_args = [insert_col_into_func(arg, col_symbol) for arg in args] + return Expr(:call, func_name, new_args...) 
+ else + # Other expressions; return as-is + return func_expr + end +end +function generate_func_name(func_expr, skip_funcs=String[]) + if isa(func_expr, Symbol) + return string(func_expr) + elseif isa(func_expr, Expr) && func_expr.head == :call + func_name_expr = func_expr.args[1] + if isa(func_name_expr, Symbol) + func_name = string(func_name_expr) + else + func_name = generate_func_name(func_name_expr, skip_funcs) + end + # Process nested function names + nested_names = [generate_func_name(arg, skip_funcs) for arg in func_expr.args[2:end]] + # Exclude function names in skip_funcs + if func_name in skip_funcs + # Skip adding this function name + return join(nested_names, "_") + else + # Remove empty strings from nested_names + nested_names = filter(n -> n != "", nested_names) + return join([func_name; nested_names], "_") + end + else + return "" + end +end + + function parse_interpolation2(expr) MacroTools.postwalk(expr) do x if @capture(x, !!variable_Symbol) diff --git a/src/parsing_athena.jl b/src/parsing_athena.jl index 7ce7a14..f62f1e1 100644 --- a/src/parsing_athena.jl +++ b/src/parsing_athena.jl @@ -82,11 +82,15 @@ function expr_to_sql_trino(expr, sq; from_summarize::Bool) window_clause = construct_window_clause(sq) return "VAR_SAMP($(string(a))) $(window_clause)" end - elseif @capture(x, agg(str_)) + elseif isa(x, Expr) && x.head == :call && x.args[1] == :agg + args = x.args[2:end] # Capture all arguments to agg if from_summarize - return error("agg is only needed with aggregate functions in @mutate") + return error("agg is only needed with aggregate functions in @mutate") else window_clause = construct_window_clause(sq) + # Create the SQL string representation of the agg function call + arg_str = join(map(string, args), ", ") + str = "$(arg_str)" return "$(str) $(window_clause)" end elseif !isempty(sq.window_order) && isa(x, Expr) && x.head == :call diff --git a/src/parsing_clickhouse.jl b/src/parsing_clickhouse.jl index 3026097..ecd492b 100644 --- 
a/src/parsing_clickhouse.jl +++ b/src/parsing_clickhouse.jl @@ -82,11 +82,15 @@ function expr_to_sql_clickhouse(expr, sq; from_summarize::Bool) window_clause = construct_window_clause(sq) return "VAR_SAMP($(string(a))) $(window_clause)" end - elseif @capture(x, agg(str_)) + elseif isa(x, Expr) && x.head == :call && x.args[1] == :agg + args = x.args[2:end] # Capture all arguments to agg if from_summarize - return error("agg is only needed with aggregate functions in @mutate") + return error("agg is only needed with aggregate functions in @mutate") else window_clause = construct_window_clause(sq) + # Create the SQL string representation of the agg function call + arg_str = join(map(string, args), ", ") + str = "$(arg_str)" return "$(str) $(window_clause)" end elseif !isempty(sq.window_order) && isa(x, Expr) && x.head == :call diff --git a/src/parsing_duckdb.jl b/src/parsing_duckdb.jl index 4cb9973..2b4c4c5 100644 --- a/src/parsing_duckdb.jl +++ b/src/parsing_duckdb.jl @@ -81,11 +81,15 @@ function expr_to_sql_duckdb(expr, sq; from_summarize::Bool) window_clause = construct_window_clause(sq) return "VAR_SAMP($(string(a))) $(window_clause)" end - elseif @capture(x, agg(str_)) + elseif isa(x, Expr) && x.head == :call && x.args[1] == :agg + args = x.args[2:end] # Capture all arguments to agg if from_summarize - return error("agg is only needed with aggregate functions in @mutate") + return error("agg is only needed with aggregate functions in @mutate") else window_clause = construct_window_clause(sq) + # Create the SQL string representation of the agg function call + arg_str = join(map(string, args), ", ") + str = "$(arg_str)" return "$(str) $(window_clause)" end elseif !isempty(sq.window_order) && isa(x, Expr) && x.head == :call diff --git a/src/parsing_gbq.jl b/src/parsing_gbq.jl index d10a1de..e990596 100644 --- a/src/parsing_gbq.jl +++ b/src/parsing_gbq.jl @@ -111,11 +111,15 @@ function expr_to_sql_gbq(expr, sq; from_summarize::Bool) window_clause = 
construct_window_clause(sq) return "VAR_SAMP($(string(a))) $(window_clause)" end - elseif @capture(x, agg(str_)) + elseif isa(x, Expr) && x.head == :call && x.args[1] == :agg + args = x.args[2:end] # Capture all arguments to agg if from_summarize - return error("agg is only needed with aggregate functions in @mutate") + return error("agg is only needed with aggregate functions in @mutate") else window_clause = construct_window_clause(sq) + # Create the SQL string representation of the agg function call + arg_str = join(map(string, args), ", ") + str = "$(arg_str)" return "$(str) $(window_clause)" end elseif !isempty(sq.window_order) && isa(x, Expr) && x.head == :call diff --git a/src/parsing_mssql.jl b/src/parsing_mssql.jl index 8b3d76b..4f01641 100644 --- a/src/parsing_mssql.jl +++ b/src/parsing_mssql.jl @@ -82,11 +82,15 @@ function expr_to_sql_mssql(expr, sq; from_summarize::Bool) window_clause = construct_window_clause(sq) return "VAR_SAMP($(string(a))) $(window_clause)" end - elseif @capture(x, agg(str_)) + elseif isa(x, Expr) && x.head == :call && x.args[1] == :agg + args = x.args[2:end] # Capture all arguments to agg if from_summarize - return error("agg is only needed with aggregate functions in @mutate") + return error("agg is only needed with aggregate functions in @mutate") else window_clause = construct_window_clause(sq) + # Create the SQL string representation of the agg function call + arg_str = join(map(string, args), ", ") + str = "$(arg_str)" return "$(str) $(window_clause)" end elseif !isempty(sq.window_order) && isa(x, Expr) && x.head == :call diff --git a/src/parsing_mysql.jl b/src/parsing_mysql.jl index 7550d48..3ed9989 100644 --- a/src/parsing_mysql.jl +++ b/src/parsing_mysql.jl @@ -82,11 +82,15 @@ function expr_to_sql_mysql(expr, sq; from_summarize::Bool) window_clause = construct_window_clause(sq) return "VAR_SAMP($(string(a))) $(window_clause)" end - elseif @capture(x, agg(str_)) + elseif isa(x, Expr) && x.head == :call && x.args[1] == :agg 
+ args = x.args[2:end] # Capture all arguments to agg if from_summarize - return error("agg is only needed with aggregate functions in @mutate") + return error("agg is only needed with aggregate functions in @mutate") else window_clause = construct_window_clause(sq) + # Create the SQL string representation of the agg function call + arg_str = join(map(string, args), ", ") + str = "$(arg_str)" return "$(str) $(window_clause)" end elseif !isempty(sq.window_order) && isa(x, Expr) && x.head == :call diff --git a/src/parsing_oracle.jl b/src/parsing_oracle.jl index b0c0faf..140f865 100644 --- a/src/parsing_oracle.jl +++ b/src/parsing_oracle.jl @@ -82,11 +82,15 @@ function expr_to_sql_oracle(expr, sq; from_summarize::Bool) window_clause = construct_window_clause(sq) return "VAR_SAMP($(string(a))) $(window_clause)" end - elseif @capture(x, agg(str_)) + elseif @capture(x, Expr(:call, :agg, args...)) if from_summarize - return error("agg is only needed with aggregate functions in @mutate") + return error("agg is only needed with aggregate functions in @mutate") else window_clause = construct_window_clause(sq) + + # Create the SQL string representation of the aggregate function call + arg_str = join(map(string, args), ", ") # Join arguments into a string + str = "agg($(arg_str))" # Construct the function call string return "$(str) $(window_clause)" end elseif !isempty(sq.window_order) && isa(x, Expr) && x.head == :call diff --git a/src/parsing_postgres.jl b/src/parsing_postgres.jl index e1a3c09..b7b94a6 100644 --- a/src/parsing_postgres.jl +++ b/src/parsing_postgres.jl @@ -82,11 +82,15 @@ function expr_to_sql_postgres(expr, sq; from_summarize::Bool) window_clause = construct_window_clause(sq) return "VAR_SAMP($(string(a))) $(window_clause)" end - elseif @capture(x, agg(str_)) + elseif isa(x, Expr) && x.head == :call && x.args[1] == :agg + args = x.args[2:end] # Capture all arguments to agg if from_summarize - return error("agg is only needed with aggregate functions in 
@mutate") + return error("agg is only needed with aggregate functions in @mutate") else window_clause = construct_window_clause(sq) + # Create the SQL string representation of the agg function call + arg_str = join(map(string, args), ", ") + str = "$(arg_str)" return "$(str) $(window_clause)" end elseif !isempty(sq.window_order) && isa(x, Expr) && x.head == :call diff --git a/src/parsing_snowflake.jl b/src/parsing_snowflake.jl index db922c0..a24f150 100644 --- a/src/parsing_snowflake.jl +++ b/src/parsing_snowflake.jl @@ -83,11 +83,15 @@ function expr_to_sql_snowflake(expr, sq; from_summarize::Bool) window_clause = construct_window_clause(sq) return "VAR_SAMP($(string(a))) $(window_clause)" end - elseif @capture(x, agg(str_)) + elseif isa(x, Expr) && x.head == :call && x.args[1] == :agg + args = x.args[2:end] # Capture all arguments to agg if from_summarize - return error("agg is only needed with aggregate functions in @mutate") + return error("agg is only needed with aggregate functions in @mutate") else window_clause = construct_window_clause(sq) + # Create the SQL string representation of the agg function call + arg_str = join(map(string, args), ", ") + str = "$(arg_str)" return "$(str) $(window_clause)" end elseif !isempty(sq.window_order) && isa(x, Expr) && x.head == :call diff --git a/src/parsing_sqlite.jl b/src/parsing_sqlite.jl index f08cb9e..3ba1790 100644 --- a/src/parsing_sqlite.jl +++ b/src/parsing_sqlite.jl @@ -55,11 +55,15 @@ function expr_to_sql_lite(expr, sq; from_summarize::Bool) window_clause = construct_window_clause(sq, from_cumsum = true) return "SUM($(string(a))) $(window_clause)" end - elseif @capture(x, agg(str_)) + elseif isa(x, Expr) && x.head == :call && x.args[1] == :agg + args = x.args[2:end] # Capture all arguments to agg if from_summarize - return error("agg is only needed with aggregate functions in @mutate") + return error("agg is only needed with aggregate functions in @mutate") else window_clause = construct_window_clause(sq) + # 
Create the SQL string representation of the agg function call + arg_str = join(map(string, args), ", ") + str = "$(arg_str)" return "$(str) $(window_clause)" end # exc_capture_bug used above to allow proper _ function name capturing From d77c947f7258b21fb8e6d2d96e7007dfc92ef9c1 Mon Sep 17 00:00:00 2001 From: Daniel Rizk Date: Sun, 22 Sep 2024 16:52:34 -0400 Subject: [PATCH 4/7] adds docstrings for `t` and `from_query` --- NEWS.md | 11 ++++-- src/docstrings.jl | 89 +++++++++++++++++++++++++++++++++++++++++++++++ src/structs.jl | 3 ++ 3 files changed, 100 insertions(+), 3 deletions(-) diff --git a/NEWS.md b/NEWS.md index ebcaa6c..3d7589e 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,10 +1,15 @@ # TidierDB.jl updates ## v0.3.4 - 2024 -- docs around using UDFs flexibility of TidierDB parsing -- adds `agg()` to allow any built in sql aggregate function to be used in `@mutate` -- `t()` as alias for `from_query()` for convenience +- adds docs around using UDFs flexibility of TidierDB parsing +- Adds `agg()` to use any aggregate built into a database to be used in `@mutate`. (`@summarize` continues to support all without `agg()`) +- Adds `t(query)` as a more efficient way to reference tables. +``` +table = db_table(db, "name") +@chain t(table) ... +``` - Bugfix: fixes MsSQL joins +- Fixes windowing ## v0.3.3 - 2024-08-29 - Bugfix: `@mutate` allows type conversion as part of larger mutate expressions diff --git a/src/docstrings.jl b/src/docstrings.jl index 8ca088a..aed3187 100644 --- a/src/docstrings.jl +++ b/src/docstrings.jl @@ -1185,4 +1185,93 @@ julia> show_tables(db) # there are no tables in when first loading so df below i │ String ─────┴──────── ``` +""" + + +const docstring_from_query = +""" + from_query(query) + +This is an alias for `t()`. Refer to SQL query without changing the underlying struct. 
This is an alternate and convenient way to refer to an existing DB table + +# Arguments +- `query`: The SQL query to reference + +# Examples +```julia + +julia> df = DataFrame(id = [string('A' + i ÷ 26, 'A' + i % 26) for i in 0:9], + groups = [i % 2 == 0 ? "aa" : "bb" for i in 1:10], + value = repeat(1:5, 2), + percent = 0.1:0.1:1.0); + +julia> db = connect(duckdb()); + +julia> copy_to(db, df, "df_mem"); + +julia> df_mem = db_table(db, "df_mem"); + +julia> from_query(df_mem) +SQLQuery("", "df_mem", "", "", "", "", "", "", false, false, 4×4 DataFrame + Row │ name type current_selxn table_name + │ String? String? Int64 String +─────┼───────────────────────────────────────────── + 1 │ id VARCHAR 1 df_mem + 2 │ groups VARCHAR 1 df_mem + 3 │ value BIGINT 1 df_mem + 4 │ percent DOUBLE 1 df_mem, false, DuckDB.DB(":memory:"), TidierDB.CTE[], 0, nothing, "", "") +``` +""" + + +const docstring_t = +""" + t(query) + +Alias for from_query. Refer to SQL query without changing the underlying struct. This is an alternate and convenient way to refer to an existing DB table + +# Arguments +- `query`: The SQL query to reference + +# Examples +```julia + +julia> df = DataFrame(id = [string('A' + i ÷ 26, 'A' + i % 26) for i in 0:9], + groups = [i % 2 == 0 ? "aa" : "bb" for i in 1:10], + value = repeat(1:5, 2), + percent = 0.1:0.1:1.0); + +julia> db = connect(duckdb()); + +julia> copy_to(db, df, "df_mem"); + +julia> df_mem = db_table(db, "df_mem"); + + +julia> @chain t(df_mem) @collect +10×4 DataFrame + Row │ id groups value percent + │ String? String? Int64? Float64? +─────┼──────────────────────────────────── + 1 │ AA bb 1 0.1 + 2 │ AB aa 2 0.2 + 3 │ AC bb 3 0.3 + 4 │ AD aa 4 0.4 + 5 │ AE bb 5 0.5 + 6 │ AF aa 1 0.6 + 7 │ AG bb 2 0.7 + 8 │ AH aa 3 0.8 + 9 │ AI bb 4 0.9 + 10 │ AJ aa 5 1.0 + +julia> query_part = @chain t(df_mem) @select groups:percent; + +julia> @chain t(query_part) @filter(value == 4) @collect +2×3 DataFrame + Row │ groups value percent + │ String? Int64? Float64? 
+─────┼─────────────────────────── + 1 │ aa 4 0.4 + 2 │ bb 4 0.9 +``` """ \ No newline at end of file diff --git a/src/structs.jl b/src/structs.jl index 9ba4df3..256f5be 100644 --- a/src/structs.jl +++ b/src/structs.jl @@ -103,4 +103,7 @@ function from_query(query::TidierDB.SQLQuery) return new_query end +""" +$docstring_t +""" t(table) = from_query(table) \ No newline at end of file From 3239ba776a338ccf722b6d725c645bc5839ba10e Mon Sep 17 00:00:00 2001 From: Daniel Rizk Date: Sun, 22 Sep 2024 19:48:15 -0400 Subject: [PATCH 5/7] switch to sqlite UDF ex, update news --- NEWS.md | 12 +-- docs/examples/UserGuide/udfs_ex.jl | 125 +++++++++-------------------- 2 files changed, 43 insertions(+), 94 deletions(-) diff --git a/NEWS.md b/NEWS.md index 3d7589e..51723d8 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,15 +1,17 @@ # TidierDB.jl updates ## v0.3.4 - 2024 -- adds docs around using UDFs flexibility of TidierDB parsing -- Adds `agg()` to use any aggregate built into a database to be used in `@mutate`. (`@summarize` continues to support all without `agg()`) -- Adds `t(query)` as a more efficient way to reference tables. +TidierDB works with any existing SQL function +- Docs on using any existing SQL function in TidierDB +- Docs on user defined functions (UDFs) in TidierDB +- Adds `agg()` to use any aggregate built into a database to be used in `@mutate`. (`@summarize` continues to support all aggregate SQL functions without `agg()`) +- Adds `t(query)` as a more efficient alternative to reference tables. ``` table = db_table(db, "name") -@chain t(table) ... +@chain t(table) ... 
``` - Bugfix: fixes MsSQL joins -- Fixes windowing +- Bugfix: window functions ## v0.3.3 - 2024-08-29 - Bugfix: `@mutate` allows type conversion as part of larger mutate expressions diff --git a/docs/examples/UserGuide/udfs_ex.jl b/docs/examples/UserGuide/udfs_ex.jl index 729f6cf..71ac807 100644 --- a/docs/examples/UserGuide/udfs_ex.jl +++ b/docs/examples/UserGuide/udfs_ex.jl @@ -60,7 +60,6 @@ # ``` - # ## DuckDB function chaining # In DuckDB, functions can be chained together with `.`. TidierDB lets you leverage this. # ``` @@ -106,92 +105,40 @@ # 1 │ Hornet Sportabout 18.7 8 360.0 175 # ``` -# ## UDFs in `@mutate` -# The UDF drops inplace with no further adjustments. Continue below to learn how to create a UDF in DuckDB.jl -# ``` -# @chain t(mtcars) begin -# @mutate(test = diff_of_squares(cyl, hp)) -# @select(test, cyl, hp) -# @collect +# ## UDF SQLite Example +# ``` +# using SQLite +# sql = connect(sqlite()); +# df = DataFrame(id = [string('A' + i ÷ 26, 'A' + i % 26) for i in 0:9], +# groups = [i % 2 == 0 ? "aa" : "bb" for i in 1:10], +# value = repeat(1:5, 2), +# percent = 0.1:0.1:1.0); +# +# copy_to(sql, df, "df_mem"); +# SQLite.@register sql function diff_of_squares(x, y) +# x^2 - y^2 +# end; +# +# @chain db_table(sql, "df_mem") begin +# @select(value, percent) +# @mutate(plus3 = diff_of_squares(value, percent)) +# @collect # end -# 32×3 DataFrame -# Row │ test cyl hp -# │ Int64 Int64 Int64 -# ─────┼─────────────────────── -# 1 │ -12064 6 110 -# 2 │ -12064 6 110 -# 3 │ -8633 4 93 -# 4 │ -12064 6 110 -# 5 │ -30561 8 175 -# 6 │ -10989 6 105 -# 7 │ -59961 8 245 -# ⋮ │ ⋮ ⋮ ⋮ -# 27 │ -8265 4 91 -# 28 │ -12753 4 113 -# 29 │ -69632 8 264 -# 30 │ -30589 6 175 -# 31 │ -112161 8 335 -# 32 │ -11865 4 109 -# 19 rows omitted -# ``` - - -# ## How to create UDF in DuckDB.jl -# Once a UDF is regestered in your DuckDB db, you can use it as you would any other SQL function, with no decorators. 
-# This next section will walk through defining a function, how to register it and finally, how to use it with TidierDB. -# Of note, if other - -# ## Defining a UDF -# First, lets define a function that calculates the difference of squares. -# Input and Output Types: -# - `input::DuckDB.duckdb_data_chunk` is the incoming data chunk (a batch of rows) that DuckDB passes to the function. -# - `output::DuckDB.duckdb_vector` is where the result of the function is written. -# ``` -# function DiffOfSquares(info::DuckDB.duckdb_function_info, input::DuckDB.duckdb_data_chunk, output::DuckDB.duckdb_vector) -# # We first convert the raw input to a DataChunk object using `DuckDB.DataChunk(input, false)` -# input = DuckDB.DataChunk(input, false) -# # Determine how many rows (n) are in the chunk using `DuckDB.get_size(input)`. -# n = DuckDB.get_size(input) -# # We retrieve the first and second input columns with DuckDB.get_vector() -# # And convert them into Julia arrays with DuckDB.get_array. -# a_data = DuckDB.get_array(DuckDB.get_vector(input, 1), Int64, n) -# b_data = DuckDB.get_array(DuckDB.get_vector(input, 2), Int64, n) -# # create an output array output_data corresponding to the output column -# output_data = DuckDB.get_array(DuckDB.Vec(output), Int64, n) -# # loop through each row, perform the desired operation and store the result in output_data[row]. -# for row in 1:n -# output_data[row] = a_data[row]^2 - b_data[row]^2 -# end -# end; -# ``` - -# ## Configure the UDF -# Once the function is defined, the next step is to register it in your DuckDB db. This involves creating a scalar function object, specifying the input/output types, linking the function, and registering it with the database. 
-# ``` -# # Create scalar function object -# f = DuckDB.duckdb_create_scalar_function() -# DuckDB.duckdb_scalar_function_set_name(f, "diff_of_squares") -# ``` - -# Input parameters are defined with `duckdb_create_logical_type(type)` where type is, for example, `DUCKDB_TYPE_BIGINT` for integers or `DUCKDB_TYPE_VARCHAR` for strings. -# ``` -# # Define input parameters as BIGINT -# type = DuckDB.duckdb_create_logical_type(DuckDB.DUCKDB_TYPE_BIGINT) -# DuckDB.duckdb_table_function_add_parameter(f, type) -# DuckDB.duckdb_table_function_add_parameter(f, type) - -# # Define return type as BIGINT -# DuckDB.duckdb_scalar_function_set_return_type(f, type) -# DuckDB.duckdb_destroy_logical_type(type) -# ``` - -# ## Link and Register the Julia Function -# `@cfunction` is used to convert the Julia function into a callable C function, which DuckDB can invoke. -# ``` -# CDiffOfSquares = @cfunction(DiffOfSquares, Cvoid, (DuckDB.duckdb_function_info, DuckDB.duckdb_data_chunk, DuckDB.duckdb_vector)) - -# # Set the function handler and register -# DuckDB.duckdb_scalar_function_set_function(f, CDiffOfSquares) -# DuckDB.duckdb_register_scalar_function(con.handle, f) -# ``` - +# 10×3 DataFrame +# Row │ value percent plus3 +# │ Int64 Float64 Float64 +# ─────┼───────────────────────── +# 1 │ 1 0.1 0.99 +# 2 │ 2 0.2 3.96 +# 3 │ 3 0.3 8.91 +# 4 │ 4 0.4 15.84 +# 5 │ 5 0.5 24.75 +# 6 │ 1 0.6 0.64 +# 7 │ 2 0.7 3.51 +# 8 │ 3 0.8 8.36 +# 9 │ 4 0.9 15.19 +# 10 │ 5 1.0 24.0 +# ``` + +# ## How to create UDF in DuckDB +# Example coming soon.. 
\ No newline at end of file From 375e0ea429e51af9429b606c43eca470ea6760e5 Mon Sep 17 00:00:00 2001 From: Daniel Rizk Date: Sun, 22 Sep 2024 20:35:14 -0400 Subject: [PATCH 6/7] fix docstrings --- src/docstrings.jl | 482 ++++++++++++++++++++++------------------------ src/structs.jl | 3 - 2 files changed, 228 insertions(+), 257 deletions(-) diff --git a/src/docstrings.jl b/src/docstrings.jl index aed3187..557c31d 100644 --- a/src/docstrings.jl +++ b/src/docstrings.jl @@ -25,38 +25,38 @@ julia> @chain db_table(db, :df_mem) begin @collect end 10×3 DataFrame - Row │ groups value percent - │ String? Int64? Float64? -─────┼─────────────────────────── - 1 │ bb 1 0.1 - 2 │ aa 2 0.2 - 3 │ bb 3 0.3 - 4 │ aa 4 0.4 - 5 │ bb 5 0.5 - 6 │ aa 1 0.6 - 7 │ bb 2 0.7 - 8 │ aa 3 0.8 - 9 │ bb 4 0.9 - 10 │ aa 5 1.0 + Row │ groups value percent + │ String Int64 Float64 +─────┼──────────────────────── + 1 │ bb 1 0.1 + 2 │ aa 2 0.2 + 3 │ bb 3 0.3 + 4 │ aa 4 0.4 + 5 │ bb 5 0.5 + 6 │ aa 1 0.6 + 7 │ bb 2 0.7 + 8 │ aa 3 0.8 + 9 │ bb 4 0.9 + 10 │ aa 5 1.0 julia> @chain db_table(db, :df_mem) begin @select(contains("e")) @collect end 10×2 DataFrame - Row │ value percent - │ Int64? Float64? -─────┼────────────────── - 1 │ 1 0.1 - 2 │ 2 0.2 - 3 │ 3 0.3 - 4 │ 4 0.4 - 5 │ 5 0.5 - 6 │ 1 0.6 - 7 │ 2 0.7 - 8 │ 3 0.8 - 9 │ 4 0.9 - 10 │ 5 1.0 + Row │ value percent + │ Int64 Float64 +─────┼──────────────── + 1 │ 1 0.1 + 2 │ 2 0.2 + 3 │ 3 0.3 + 4 │ 4 0.4 + 5 │ 5 0.5 + 6 │ 1 0.6 + 7 │ 2 0.7 + 8 │ 3 0.8 + 9 │ 4 0.9 + 10 │ 5 1.0 ``` """ @@ -90,14 +90,14 @@ julia> @chain db_table(db, :df_mem) begin @collect end 5×4 DataFrame - Row │ id groups value percent - │ String? String? Int64? Float64? 
-─────┼──────────────────────────────────── - 1 │ AF aa 1 0.6 - 2 │ AG bb 2 0.7 - 3 │ AH aa 3 0.8 - 4 │ AI bb 4 0.9 - 5 │ AJ aa 5 1.0 + Row │ id groups value percent + │ String String Int64 Float64 +─────┼──────────────────────────────── + 1 │ AF aa 1 0.6 + 2 │ AG bb 2 0.7 + 3 │ AH aa 3 0.8 + 4 │ AI bb 4 0.9 + 5 │ AJ aa 5 1.0 julia> @chain db_table(db, :df_mem) begin @group_by(groups) @@ -110,11 +110,11 @@ julia> @chain db_table(db, :df_mem) begin @collect end 2×2 DataFrame - Row │ groups mean - │ String? Float64? -─────┼─────────────────── - 1 │ aa 0.6 - 2 │ bb 0.5 + Row │ groups mean + │ String Float64 +─────┼───────────────── + 1 │ aa 0.6 + 2 │ bb 0.5 ``` """ @@ -145,9 +145,9 @@ julia> @chain db_table(db, :df_mem) begin @collect end 2×1 DataFrame - Row │ groups - │ String? -─────┼───────── + Row │ groups + │ String +─────┼──────── 1 │ aa 2 │ bb ``` @@ -178,19 +178,19 @@ julia> @chain db_table(db, :df_mem) begin @collect end 10×5 DataFrame - Row │ id groups value percent new_col - │ String? String? Int64? Float64? Float64? -─────┼────────────────────────────────────────────── - 1 │ AA bb 4 0.1 0.01 - 2 │ AB aa 8 0.2 0.04 - 3 │ AC bb 12 0.3 0.09 - 4 │ AD aa 16 0.4 0.16 - 5 │ AE bb 20 0.5 0.25 - 6 │ AF aa 4 0.6 0.36 - 7 │ AG bb 8 0.7 0.49 - 8 │ AH aa 12 0.8 0.64 - 9 │ AI bb 16 0.9 0.81 - 10 │ AJ aa 20 1.0 1.0 + Row │ id groups value percent new_col + │ String String Int64 Float64 Float64 +─────┼───────────────────────────────────────── + 1 │ AA bb 4 0.1 0.01 + 2 │ AB aa 8 0.2 0.04 + 3 │ AC bb 12 0.3 0.09 + 4 │ AD aa 16 0.4 0.16 + 5 │ AE bb 20 0.5 0.25 + 6 │ AF aa 4 0.6 0.36 + 7 │ AG bb 8 0.7 0.49 + 8 │ AH aa 12 0.8 0.64 + 9 │ AI bb 16 0.9 0.81 + 10 │ AJ aa 20 1.0 1.0 ``` """ @@ -221,11 +221,11 @@ julia> @chain db_table(db, :df_mem) begin @collect end 2×5 DataFrame - Row │ groups mean_value mean_percent sum_value sum_percent - │ String? Float64? Float64? Int128? Float64? 
-─────┼─────────────────────────────────────────────────────────── - 1 │ aa 3.0 0.6 15 3.0 - 2 │ bb 3.0 0.5 15 2.5 + Row │ groups mean_value mean_percent sum_value sum_percent + │ String Float64 Float64 Int128 Float64 +─────┼────────────────────────────────────────────────────────── + 1 │ aa 3.0 0.6 15 3.0 + 2 │ bb 3.0 0.5 15 2.5 julia> @chain db_table(db, :df_mem) begin @group_by(groups) @@ -234,11 +234,11 @@ julia> @chain db_table(db, :df_mem) begin @collect end 2×3 DataFrame - Row │ groups test n - │ String? Float64? Int64? -─────┼─────────────────────────── - 1 │ aa 3.0 5 - 2 │ bb 2.5 5 + Row │ groups test n + │ String Float64 Int64 +─────┼──────────────────────── + 1 │ aa 3.0 5 + 2 │ bb 2.5 5 ``` """ const docstring_summarise = @@ -268,11 +268,11 @@ julia> @chain db_table(db, :df_mem) begin @collect end 2×5 DataFrame - Row │ groups mean_value mean_percent sum_value sum_percent - │ String? Float64? Float64? Int128? Float64? -─────┼─────────────────────────────────────────────────────────── - 1 │ aa 3.0 0.6 15 3.0 - 2 │ bb 3.0 0.5 15 2.5 + Row │ groups mean_value mean_percent sum_value sum_percent + │ String Float64 Float64 Int128 Float64 +─────┼────────────────────────────────────────────────────────── + 1 │ aa 3.0 0.6 15 3.0 + 2 │ bb 3.0 0.5 15 2.5 julia> @chain db_table(db, :df_mem) begin @group_by(groups) @@ -281,11 +281,11 @@ julia> @chain db_table(db, :df_mem) begin @collect end 2×3 DataFrame - Row │ groups test n - │ String? Float64? Int64? -─────┼─────────────────────────── - 1 │ aa 3.0 5 - 2 │ bb 2.5 5 + Row │ groups test n + │ String Float64 Int64 +─────┼──────────────────────── + 1 │ aa 3.0 5 + 2 │ bb 2.5 5 ``` """ @@ -322,11 +322,11 @@ julia> @chain db_table(db, :df_mem) begin @collect end 2×5 DataFrame - Row │ id groups value percent rank_col - │ String? String? Int64? Float64? Int64? 
-─────┼────────────────────────────────────────────── - 1 │ AA bb 1 0.1 1 - 2 │ AF aa 1 0.6 1 + Row │ id groups value percent rank_col + │ String String Int64 Float64 Int64 +─────┼────────────────────────────────────────── + 1 │ AA bb 1 0.1 1 + 2 │ AF aa 1 0.6 1 ``` """ @@ -363,11 +363,11 @@ julia> @chain db_table(db, :df_mem) begin @collect end 2×5 DataFrame - Row │ id groups value percent rank_col - │ String? String? Int64? Float64? Int64? -─────┼────────────────────────────────────────────── - 1 │ AE bb 5 0.5 1 - 2 │ AJ aa 5 1.0 1 + Row │ id groups value percent rank_col + │ String String Int64 Float64 Int64 +─────┼────────────────────────────────────────── + 1 │ AE bb 5 0.5 1 + 2 │ AJ aa 5 1.0 1 ``` """ @@ -430,19 +430,19 @@ julia> @chain db_table(db, :df_mem) begin @collect end 10×4 DataFrame - Row │ id groups value percent - │ String? String? Int64? Float64? -─────┼──────────────────────────────────── - 1 │ AF aa 1 0.6 - 2 │ AA bb 1 0.1 - 3 │ AG bb 2 0.7 - 4 │ AB aa 2 0.2 - 5 │ AH aa 3 0.8 - 6 │ AC bb 3 0.3 - 7 │ AI bb 4 0.9 - 8 │ AD aa 4 0.4 - 9 │ AJ aa 5 1.0 - 10 │ AE bb 5 0.5 + Row │ id groups value percent + │ String String Int64 Float64 +─────┼──────────────────────────────── + 1 │ AF aa 1 0.6 + 2 │ AA bb 1 0.1 + 3 │ AG bb 2 0.7 + 4 │ AB aa 2 0.2 + 5 │ AH aa 3 0.8 + 6 │ AC bb 3 0.3 + 7 │ AI bb 4 0.9 + 8 │ AD aa 4 0.4 + 9 │ AJ aa 5 1.0 + 10 │ AE bb 5 0.5 ``` """ @@ -473,11 +473,11 @@ julia> @chain db_table(db, :df_mem) begin @collect end 2×2 DataFrame - Row │ groups count - │ String? Int64? -─────┼───────────────── - 1 │ aa 5 - 2 │ bb 5 + Row │ groups count + │ String Int64 +─────┼─────────────── + 1 │ aa 5 + 2 │ bb 5 ``` """ @@ -509,14 +509,14 @@ julia> @chain db_table(db, :df_mem) begin @collect end 5×1 DataFrame - Row │ value - │ Int64? 
-─────┼──────── - 1 │ 1 - 2 │ 2 - 3 │ 3 - 4 │ 4 - 5 │ 5 + Row │ value + │ Int64 +─────┼─────── + 1 │ 1 + 2 │ 2 + 3 │ 3 + 4 │ 4 + 5 │ 5 julia> @chain db_table(db, :df_mem) begin @distinct @@ -524,19 +524,19 @@ julia> @chain db_table(db, :df_mem) begin @collect end 10×4 DataFrame - Row │ id groups value percent - │ String? String? Int64? Float64? -─────┼──────────────────────────────────── - 1 │ AA bb 1 0.1 - 2 │ AB aa 2 0.2 - 3 │ AC bb 3 0.3 - 4 │ AD aa 4 0.4 - 5 │ AE bb 5 0.5 - 6 │ AF aa 1 0.6 - 7 │ AG bb 2 0.7 - 8 │ AH aa 3 0.8 - 9 │ AI bb 4 0.9 - 10 │ AJ aa 5 1.0 + Row │ id groups value percent + │ String String Int64 Float64 +─────┼──────────────────────────────── + 1 │ AA bb 1 0.1 + 2 │ AB aa 2 0.2 + 3 │ AC bb 3 0.3 + 4 │ AD aa 4 0.4 + 5 │ AE bb 5 0.5 + 6 │ AF aa 1 0.6 + 7 │ AG bb 2 0.7 + 8 │ AH aa 3 0.8 + 9 │ AI bb 4 0.9 + 10 │ AJ aa 5 1.0 ``` """ @@ -577,19 +577,19 @@ julia> @chain db_table(db, :df_mem) begin @collect end 10×7 DataFrame - Row │ id groups value percent id2 category score - │ String? String? Int64? Float64? String? String? Int64? -─────┼──────────────────────────────────────────────────────────────── - 1 │ AA bb 1 0.1 AA X 88 - 2 │ AC bb 3 0.3 AC Y 92 - 3 │ AE bb 5 0.5 AE X 77 - 4 │ AG bb 2 0.7 AG Y 83 - 5 │ AI bb 4 0.9 AI X 95 - 6 │ AB aa 2 0.2 missing missing missing - 7 │ AD aa 4 0.4 missing missing missing - 8 │ AF aa 1 0.6 missing missing missing - 9 │ AH aa 3 0.8 missing missing missing - 10 │ AJ aa 5 1.0 missing missing missing + Row │ id groups value percent id2 category score + │ String String Int64 Float64 String? String? Int64? 
+─────┼──────────────────────────────────────────────────────────── + 1 │ AA bb 1 0.1 AA X 88 + 2 │ AC bb 3 0.3 AC Y 92 + 3 │ AE bb 5 0.5 AE X 77 + 4 │ AG bb 2 0.7 AG Y 83 + 5 │ AI bb 4 0.9 AI X 95 + 6 │ AB aa 2 0.2 missing missing missing + 7 │ AD aa 4 0.4 missing missing missing + 8 │ AF aa 1 0.6 missing missing missing + 9 │ AH aa 3 0.8 missing missing missing + 10 │ AJ aa 5 1.0 missing missing missing ``` """ @@ -630,16 +630,16 @@ julia> @chain db_table(db, :df_mem) begin @collect end 7×7 DataFrame - Row │ id groups value percent id2 category score - │ String? String? Int64? Float64? String? String? Int64? -─────┼───────────────────────────────────────────────────────────────── - 1 │ AA bb 1 0.1 AA X 88 - 2 │ AC bb 3 0.3 AC Y 92 - 3 │ AE bb 5 0.5 AE X 77 - 4 │ AG bb 2 0.7 AG Y 83 - 5 │ AI bb 4 0.9 AI X 95 - 6 │ missing missing missing missing AK Y 68 - 7 │ missing missing missing missing AM X 74 + Row │ id groups value percent id2 category score + │ String? String? Int64? Float64? String String Int64 +─────┼─────────────────────────────────────────────────────────────── + 1 │ AA bb 1 0.1 AA X 88 + 2 │ AC bb 3 0.3 AC Y 92 + 3 │ AE bb 5 0.5 AE X 77 + 4 │ AG bb 2 0.7 AG Y 83 + 5 │ AI bb 4 0.9 AI X 95 + 6 │ missing missing missing missing AK Y 68 + 7 │ missing missing missing missing AM X 74 ``` """ @@ -680,14 +680,14 @@ julia> @chain db_table(db, :df_mem) begin @collect end 5×7 DataFrame - Row │ id groups value percent id2 category score - │ String? String? Int64? Float64? String? String? Int64? 
-─────┼─────────────────────────────────────────────────────────────── - 1 │ AA bb 1 0.1 AA X 88 - 2 │ AC bb 3 0.3 AC Y 92 - 3 │ AE bb 5 0.5 AE X 77 - 4 │ AG bb 2 0.7 AG Y 83 - 5 │ AI bb 4 0.9 AI X 95 + Row │ id groups value percent id2 category score + │ String String Int64 Float64 String String Int64 +─────┼───────────────────────────────────────────────────────── + 1 │ AA bb 1 0.1 AA X 88 + 2 │ AC bb 3 0.3 AC Y 92 + 3 │ AE bb 5 0.5 AE X 77 + 4 │ AG bb 2 0.7 AG Y 83 + 5 │ AI bb 4 0.9 AI X 95 ``` """ const docstring_full_join = @@ -782,14 +782,14 @@ julia> @chain db_table(db, :df_mem) begin @collect end 5×4 DataFrame - Row │ id groups value percent - │ String? String? Int64? Float64? -─────┼──────────────────────────────────── - 1 │ AA bb 1 0.1 - 2 │ AC bb 3 0.3 - 3 │ AE bb 5 0.5 - 4 │ AG bb 2 0.7 - 5 │ AI bb 4 0.9 + Row │ id groups value percent + │ String String Int64 Float64 +─────┼──────────────────────────────── + 1 │ AA bb 1 0.1 + 2 │ AC bb 3 0.3 + 3 │ AE bb 5 0.5 + 4 │ AG bb 2 0.7 + 5 │ AI bb 4 0.9 ``` """ @@ -830,14 +830,14 @@ julia> @chain db_table(db, :df_mem) begin @collect end 5×4 DataFrame - Row │ id groups value percent - │ String? String? Int64? Float64? -─────┼──────────────────────────────────── - 1 │ AB aa 2 0.2 - 2 │ AD aa 4 0.4 - 3 │ AF aa 1 0.6 - 4 │ AH aa 3 0.8 - 5 │ AJ aa 5 1.0 + Row │ id groups value percent + │ String String Int64 Float64 +─────┼──────────────────────────────── + 1 │ AB aa 2 0.2 + 2 │ AD aa 4 0.4 + 3 │ AF aa 1 0.6 + 4 │ AH aa 3 0.8 + 5 │ AJ aa 5 1.0 ``` """ @@ -868,19 +868,19 @@ julia> @chain db_table(db, :df_mem) begin @collect end 10×4 DataFrame - Row │ id groups value new_name - │ String? String? Int64? Float64? 
-─────┼──────────────────────────────────── - 1 │ AA bb 1 0.1 - 2 │ AB aa 2 0.2 - 3 │ AC bb 3 0.3 - 4 │ AD aa 4 0.4 - 5 │ AE bb 5 0.5 - 6 │ AF aa 1 0.6 - 7 │ AG bb 2 0.7 - 8 │ AH aa 3 0.8 - 9 │ AI bb 4 0.9 - 10 │ AJ aa 5 1.0 + Row │ id groups value new_name + │ String String Int64 Float64 +─────┼───────────────────────────────── + 1 │ AA bb 1 0.1 + 2 │ AB aa 2 0.2 + 3 │ AC bb 3 0.3 + 4 │ AD aa 4 0.4 + 5 │ AE bb 5 0.5 + 6 │ AF aa 1 0.6 + 7 │ AG bb 2 0.7 + 8 │ AH aa 3 0.8 + 9 │ AI bb 4 0.9 + 10 │ AJ aa 5 1.0 ``` """ @@ -1040,10 +1040,10 @@ julia> @chain db_table(db, "df_mem") begin @collect end 1×3 DataFrame - Row │ id value percent - │ String? Int64? Float64? -─────┼─────────────────────────── - 1 │ AA 1 0.1 + Row │ id value percent + │ String Int64 Float64 +─────┼──────────────────────── + 1 │ AA 1 0.1 ``` """ @@ -1115,19 +1115,19 @@ julia> copy_to(db, df, "df_mem"); julia> @collect db_table(db, "df_mem") 10×4 DataFrame - Row │ id groups value percent - │ String? String? Int64? Float64? -─────┼──────────────────────────────────── - 1 │ AA bb 1 0.1 - 2 │ AB aa 2 0.2 - 3 │ AC bb 3 0.3 - 4 │ AD aa 4 0.4 - 5 │ AE bb 5 0.5 - 6 │ AF aa 1 0.6 - 7 │ AG bb 2 0.7 - 8 │ AH aa 3 0.8 - 9 │ AI bb 4 0.9 - 10 │ AJ aa 5 1.0 + Row │ id groups value percent + │ String String Int64 Float64 +─────┼──────────────────────────────── + 1 │ AA bb 1 0.1 + 2 │ AB aa 2 0.2 + 3 │ AC bb 3 0.3 + 4 │ AD aa 4 0.4 + 5 │ AE bb 5 0.5 + 6 │ AF aa 1 0.6 + 7 │ AG bb 2 0.7 + 8 │ AH aa 3 0.8 + 9 │ AI bb 4 0.9 + 10 │ AJ aa 5 1.0 ``` """ @@ -1158,10 +1158,10 @@ julia> @chain db_table(db, :df_mem) begin @collect end 1×4 DataFrame - Row │ id groups value percent - │ String? String? Int64? Float64? 
-─────┼──────────────────────────────────── - 1 │ AA bb 1 0.1 + Row │ id groups value percent + │ String String Int64 Float64 +─────┼──────────────────────────────── + 1 │ AA bb 1 0.1 ``` """ @@ -1211,58 +1211,22 @@ julia> copy_to(db, df, "df_mem"); julia> df_mem = db_table(db, "df_mem"); -julia> from_query(df_mem) -SQLQuery("", "df_mem", "", "", "", "", "", "", false, false, 4×4 DataFrame - Row │ name type current_selxn table_name - │ String? String? Int64 String -─────┼───────────────────────────────────────────── - 1 │ id VARCHAR 1 df_mem - 2 │ groups VARCHAR 1 df_mem - 3 │ value BIGINT 1 df_mem - 4 │ percent DOUBLE 1 df_mem, false, DuckDB.DB(":memory:"), TidierDB.CTE[], 0, nothing, "", "") -``` -""" - - -const docstring_t = -""" - t(query) - -Alias for from_query. Refer to SQL query without changing the underlying struct. This is an alternate and convenient way to refer to an exisiting DB table - -# Arguments -- `query`: The SQL query to reference - -# Examples -```julia - -julia> df = DataFrame(id = [string('A' + i ÷ 26, 'A' + i % 26) for i in 0:9], - groups = [i % 2 == 0 ? "aa" : "bb" for i in 1:10], - value = repeat(1:5, 2), - percent = 0.1:0.1:1.0); - -julia> db = connect(duckdb()); - -julia> copy_to(db, df, "df_mem"); - -julia> df_mem = db_table(db, "df_mem"); - julia> @chain t(df_mem) @collect 10×4 DataFrame - Row │ id groups value percent - │ String? String? Int64? Float64? 
-─────┼──────────────────────────────────── - 1 │ AA bb 1 0.1 - 2 │ AB aa 2 0.2 - 3 │ AC bb 3 0.3 - 4 │ AD aa 4 0.4 - 5 │ AE bb 5 0.5 - 6 │ AF aa 1 0.6 - 7 │ AG bb 2 0.7 - 8 │ AH aa 3 0.8 - 9 │ AI bb 4 0.9 - 10 │ AJ aa 5 1.0 + Row │ id groups value percent + │ String String Int64 Float64 +─────┼──────────────────────────────── + 1 │ AA bb 1 0.1 + 2 │ AB aa 2 0.2 + 3 │ AC bb 3 0.3 + 4 │ AD aa 4 0.4 + 5 │ AE bb 5 0.5 + 6 │ AF aa 1 0.6 + 7 │ AG bb 2 0.7 + 8 │ AH aa 3 0.8 + 9 │ AI bb 4 0.9 + 10 │ AJ aa 5 1.0 julia> query_part = @chain t(df_mem) @select groups:percent; @@ -1273,5 +1237,15 @@ julia> @chain t(query_part) @filter(value == 4) @collect ─────┼─────────────────────────── 1 │ aa 4 0.4 2 │ bb 4 0.9 + +julia> from_query(df_mem) +SQLQuery("", "df_mem", "", "", "", "", "", "", false, false, 4×4 DataFrame + Row │ name type current_selxn table_name + │ String? String? Int64 String +─────┼───────────────────────────────────────────── + 1 │ id VARCHAR 1 df_mem + 2 │ groups VARCHAR 1 df_mem + 3 │ value BIGINT 1 df_mem + 4 │ percent DOUBLE 1 df_mem, false, DuckDB.DB(":memory:"), TidierDB.CTE[], 0, nothing, "", "") ``` """ \ No newline at end of file diff --git a/src/structs.jl b/src/structs.jl index 256f5be..9ba4df3 100644 --- a/src/structs.jl +++ b/src/structs.jl @@ -103,7 +103,4 @@ function from_query(query::TidierDB.SQLQuery) return new_query end -""" -$docstring_t -""" t(table) = from_query(table) \ No newline at end of file From 807c20e5849e074d01e76f6afdf92f42cd061650 Mon Sep 17 00:00:00 2001 From: Daniel Rizk Date: Sun, 22 Sep 2024 22:59:39 -0400 Subject: [PATCH 7/7] small fix to allow "\$path" with json functions in duckdb when used withmutate --- NEWS.md | 7 ++++--- src/TidierDB.jl | 3 +-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/NEWS.md b/NEWS.md index 51723d8..00f212d 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,10 +1,10 @@ # TidierDB.jl updates -## v0.3.4 - 2024 -TidierDB works with any exisiting SQL function +## v0.3.4 - 2024 2024-09-23 
+TidierDB works with nearly any existing SQL function; there are now docs about it. - Docs on using any exisiting SQL function in TidierDB - Docs on user defined functions (UDFs) in TidierDB -- Adds `agg()` to use any aggregate built into a database to be used in `@mutate`. (`@summarize` continues to all aggregate SQL functions without `agg()`) +- Adds `agg()` to use any aggregate built into a database to be used in `@mutate`. Support for `agg()` in across. (`@summarize` continues to allow all aggregate SQL functions without `agg()`) - Adds `t(query)` as a more efficient alternative to reference tables. ``` table = db_table(db, "name") @@ -12,6 +12,7 @@ table = db_table(db, "name") ``` - Bugfix: fixes MsSQL joins - Bugfix: window functions +- Bugfix: json paths supported for `json` DuckDB functions ## v0.3.3 - 2024-08-29 - Bugfix: `@mutate` allows type conversion as part of larger mutate expressions diff --git a/src/TidierDB.jl b/src/TidierDB.jl index 1f12798..40efff4 100644 --- a/src/TidierDB.jl +++ b/src/TidierDB.jl @@ -154,10 +154,9 @@ function finalize_query(sqlquery::SQLQuery) complete_query = replace(complete_query, "&&" => " AND ", "||" => " OR ", "FROM )" => ")" , "SELECT SELECT " => "SELECT ", "SELECT SELECT " => "SELECT ", "DISTINCT SELECT " => "DISTINCT ", "SELECT SELECT SELECT " => "SELECT ", "PARTITION BY GROUP BY" => "PARTITION BY", "GROUP BY GROUP BY" => "GROUP BY", "HAVING HAVING" => "HAVING", - r"var\"(.*?)\"" => s"\1") + r"var\"(.*?)\"" => s"\1", r"\"\\\$" => "\"\$") complete_query = replace(complete_query, ", AS " => " AS ") - if current_sql_mode[] == postgres() || current_sql_mode[] == duckdb() || current_sql_mode[] == mysql() || current_sql_mode[] == mssql() || current_sql_mode[] == clickhouse() || current_sql_mode[] == athena() || current_sql_mode[] == gbq() || current_sql_mode[] == oracle() || current_sql_mode[] == snowflake() || current_sql_mode[] == databricks() complete_query = replace(complete_query, "\"" => "'", "==" => "=") end