Skip to content

Commit

Permalink
improve read_XLSX type parsing
Browse files Browse the repository at this point in the history
  • Loading branch information
drizk1 committed Sep 6, 2024
1 parent c9c1949 commit a675015
Show file tree
Hide file tree
Showing 2 changed files with 57 additions and 69 deletions.
13 changes: 7 additions & 6 deletions src/docstrings.jl
Original file line number Diff line number Diff line change
Expand Up @@ -305,12 +305,12 @@ julia> write_xlsx(("REPORT_A" => df, "REPORT_B" => df2); path="xlsxtest.xlsx", o
julia> read_xlsx("xlsxtest.xlsx", sheet = "REPORT_A", skip = 1, n_max = 4, missingstring = [2])
3×3 DataFrame
Row │ integers strings floats
Any String Float64
─────┼─────────────────────────────────────────
1 │ missing Package makes 20.3
2 │ 3 File reading/writing 30.4
3 │ 4 even smoother 40.5
Row │ integers strings floats
Int64? String? Float64?
─────┼─────────────────────────────────────────
1 │ missing Package makes 20.3
2 │ 3 File reading/writing 30.4
3 │ 4 even smoother 40.5
```
"""

Expand Down Expand Up @@ -473,6 +473,7 @@ julia> write_sav(df, "test.por")
2 │ por 10.2
```
"""

const docstring_write_sas =
"""
write_sas(df, path)
Expand Down
113 changes: 50 additions & 63 deletions src/xlfiles.jl
Original file line number Diff line number Diff line change
@@ -1,37 +1,43 @@
function infer_type(value)
if isa(value, Missing)
return Missing
elseif isa(value, Number)
if isa(value, Int) || isa(value, Bool)
return Int
else
return Float64
end
elseif isa(value, DateTime)
return DateTime
elseif isa(value, Time)
return Time
elseif isa(value, Date)
"""
    infer_column_type(values)

Guess the element type of a spreadsheet column.

Only the first five non-missing entries of `values` are inspected. The result is one of
`Int`, `Float64`, `Date` or `String`, chosen by the first rule (in that order) that every
inspected entry satisfies; an entry satisfies a rule when it already has that type or when
its `string` form parses as that type (`yyyy-mm-dd` for dates). `String` is the catch-all.
"""
function infer_column_type(values)
    # Inference works on a small leading sample, ignoring missing cells.
    sample = collect(Iterators.take(Iterators.filter(!ismissing, values), 5))

    parses_as(T, x) = tryparse(T, string(x)) !== nothing
    is_iso_date(x) = tryparse(Date, string(x), dateformat"yyyy-mm-dd") !== nothing

    # Cheap exact-type checks first, then the parsing-based checks.
    all(x -> x isa Int, sample) && return Int
    all(x -> x isa Float64, sample) && return Float64
    all(x -> x isa Int || parses_as(Int, x), sample) && return Int
    all(x -> x isa Float64 || parses_as(Float64, x), sample) && return Float64
    all(x -> x isa Date || is_iso_date(x), sample) && return Date

    # Nothing more specific fits.
    return String
end

function convert_column(column)
non_missing_values = filter(!ismissing, column)
if isempty(non_missing_values)
return column # Return as-is if all values are missing
end

target_type = reduce((x, y) -> x === y ? x : String, map(infer_type, non_missing_values))
try
return target_type == Missing ? column : convert(Vector{target_type}, column)
catch
return column # Fallback to original if conversion fails
"""
    convert_column(col, inferred_type)

Return a new vector holding the entries of `col` converted to `inferred_type`.

`missing` entries are kept as `missing`. For `Int`, `Float64` and `Date`, an entry that
already has the target type is kept and any other entry is parsed from its `string` form
(dates use the `yyyy-mm-dd` format). Any other `inferred_type` yields the `string` form of
every entry.

Because `inferred_type` may have been guessed from only a few leading rows, some entries
may not fit it. In that case the column is widened instead of being filled with `nothing`:
an `Int` column is retried as `Float64`, and if that fails too (or for `Float64`/`Date`)
the whole column is returned as text.
"""
function convert_column(col, inferred_type)
    # Keep values that already have type T; parse everything else from its text form.
    # `tryparse` yields `nothing` for entries that do not fit.
    function parse_as(T, x)
        x isa T && return x
        T === Date && return tryparse(Date, string(x), dateformat"yyyy-mm-dd")
        return tryparse(T, string(x))
    end
    attempt(T) = [ismissing(x) ? missing : parse_as(T, x) for x in col]

    # Target types to try, most specific first.
    candidates = if inferred_type == Int
        (Int, Float64)
    elseif inferred_type == Float64
        (Float64,)
    elseif inferred_type == Date
        (Date,)
    else
        ()
    end

    for T in candidates
        converted = attempt(T)
        # Accept the first target type that every non-missing entry fits.
        any(isnothing, converted) || return converted
    end

    # Text fallback. `string` (unlike `convert(String, x)`) accepts numbers, Bools,
    # dates, etc., and returns String values unchanged.
    return [ismissing(x) ? missing : string(x) for x in col]
end


"""
$docstring_read_xlsx
"""
Expand All @@ -40,70 +46,56 @@ function read_xlsx(
sheet = nothing,
range = nothing,
col_names = true,
col_types = nothing,
missingstring = "",
trim_ws = true,
skip = 0,
n_max = Inf,
guess_max = nothing)


if startswith(path, "http://") || startswith(path, "https://")
# Fetch the content from the URL
n_max = Inf
)
# Fetch the Excel file (from URL or local path)
xf = if startswith(path, "http://") || startswith(path, "https://")
response = HTTP.get(path)

# Ensure the request was successful
if response.status != 200
error("Failed to fetch the Excel file: HTTP status code ", response.status)
end

# Read the Excel data from the fetched content
xf = XLSX.readxlsx(IOBuffer(response.body))
XLSX.readxlsx(IOBuffer(response.body))
else
# Read from a local file
xf = XLSX.readxlsx(path)
XLSX.readxlsx(path)
end
# Determine the sheet to read from

# Determine which sheet to read
sheet_to_read = isnothing(sheet) ? first(XLSX.sheetnames(xf)) : sheet

# Read the specified range or the entire sheet if range is not specified
if isnothing(range)
data = XLSX.eachtablerow(xf[sheet_to_read]) |> DataFrame
else
data = XLSX.readdata(path, sheet_to_read, range) |> DataFrame
end
# Read the table data from the specified range or full sheet
table_data = XLSX.gettable(xf[sheet_to_read])
data = DataFrame(table_data)

# Initial column name processing
if col_names == true && !isnothing(range)
col_names_row = XLSX.readdata(path, sheet_to_read, replace(range, r"[0-9]+:[0-9]+$" => "1:1"))[1, :]
rename!(data, Symbol.(col_names_row))
data = data[2:end, :]
elseif col_names != true && col_names != false
rename!(data, Symbol.(col_names))
elseif col_names == false
rename!(data, Symbol.(:auto))
# Infer and apply column types based on the first 5 rows
for col in names(data)
col_values = data[!, col]
inferred_type = infer_column_type(col_values)
data[!, col] = convert_column(col_values, inferred_type)
end

# Skipping rows
if skip > 0
data = data[(skip+1):end, :]
end

# Limiting number of rows
# Limiting the number of rows
if !isinf(n_max)
data = data[1:min(n_max, nrow(data)), :]
end

# Replace missing strings with `missing` if applicable
if !isempty(missingstring)
for missing_value in missingstring
for col in names(data)
# Apply replacement on the entire column for each missing string value
data[!, col] = replace(data[!, col], missing_value => missing)
end
end
end

# Trim whitespace
# Trim whitespace if requested
if trim_ws
for col in names(data)
if eltype(data[!, col]) == String
Expand All @@ -112,11 +104,6 @@ function read_xlsx(
end
end

# Automatic type conversion based on inferred types
for col in names(data)
data[!, col] = convert_column(data[!, col])
end

return data
end

Expand Down

0 comments on commit a675015

Please sign in to comment.