golang数据框处理和数据清洗插件库gota的使用
Golang数据框处理和数据清洗插件库Gota的使用
Gota是一个用于Go语言的数据框(DataFrame)、序列(Series)和数据清洗操作的库。它提供了类似R和Python中pandas的功能,用于处理表格数据。
DataFrame基础
DataFrame是一个二维表格数据结构,可以看作是由多个Series组成的表格。它支持多种数据加载和操作方式。
加载数据
从Series创建DataFrame
df := dataframe.New(
series.New([]string{"b", "a"}, series.String, "COL.1"),
series.New([]int{1, 2}, series.Int, "COL.2"),
series.New([]float64{3.0, 4.0}, series.Float, "COL.3"),
)
从字符串记录创建
df := dataframe.LoadRecords(
[][]string{
[]string{"A", "B", "C", "D"},
[]string{"a", "4", "5.1", "true"},
[]string{"k", "5", "7.0", "true"},
[]string{"k", "4", "6.0", "true"},
[]string{"a", "2", "7.1", "false"},
},
)
从结构体切片创建
// User is a sample record type whose slice is loaded into a DataFrame.
type User struct {
	Name     string
	Age      int
	Accuracy float64
	ignored  bool // unexported fields are ignored
}

// Each User value becomes one row of the resulting DataFrame.
users := []User{
	{Name: "Aram", Age: 17, Accuracy: 0.2, ignored: true},
	{Name: "Juan", Age: 18, Accuracy: 0.8, ignored: true},
	{Name: "Ana", Age: 22, Accuracy: 0.5, ignored: true},
}
df := dataframe.LoadStructs(users)
从CSV/JSON创建
// csvStr holds a header line followed by three data rows.
csvStr := `
Country,Date,Age,Amount,Id
"United States",2012-02-01,50,112.1,01234
"United States",2012-02-01,32,321.31,54320
"United Kingdom",2012-02-01,17,18.2,12345
`
// ReadCSV consumes any io.Reader, so wrap the string in one.
csvReader := strings.NewReader(csvStr)
df := dataframe.ReadCSV(csvReader)
// jsonStr is a JSON array of objects; the first object has no "COL.1" key.
jsonStr := `[{"COL.2":1,"COL.3":3},{"COL.1":5,"COL.2":2,"COL.3":2},{"COL.1":6,"COL.2":3,"COL.3":1}]`
jsonReader := strings.NewReader(jsonStr)
df := dataframe.ReadJSON(jsonReader)
数据操作
子集选择
// 选择行
sub := df.Subset([]int{0, 2})
// 选择列
sel1 := df.Select([]int{0, 2})
sel2 := df.Select([]string{"A", "C"})
更新值
df2 := df.Set(
[]int{0, 2},
dataframe.LoadRecords(
[][]string{
[]string{"A", "B", "C", "D"},
[]string{"b", "4", "6.0", "true"},
[]string{"c", "3", "6.0", "false"},
},
),
)
过滤数据
// Simple filter with two conditions, one on column A and one on column B.
// Keyed fields keep the literals valid regardless of how many fields F has.
fil := df.Filter(
	dataframe.F{Colname: "A", Comparator: series.Eq, Comparando: "a"},
	dataframe.F{Colname: "B", Comparator: series.Greater, Comparando: 4},
)

// Filter with the conditions combined by OR.
filAlt := df.FilterAggregation(
	dataframe.Or,
	dataframe.F{Colname: "A", Comparator: series.Eq, Comparando: "a"},
	dataframe.F{Colname: "B", Comparator: series.Greater, Comparando: 4},
)

// Filter with the conditions combined by AND.
// Named filAnd because fil is already declared above and cannot be
// redeclared with := in the same scope.
filAnd := df.FilterAggregation(
	dataframe.And,
	dataframe.F{Colname: "A", Comparator: series.Eq, Comparando: "a"},
	dataframe.F{Colname: "D", Comparator: series.Eq, Comparando: true},
)
分组和聚合
// Group the rows by the key1 and key2 columns.
groups := df.GroupBy("key1", "key2")

// Aggregate each group: MAX is paired with "values" and MIN with "values2".
// The aggregation identifiers live in package dataframe and must be qualified.
aggre := groups.Aggregation(
	[]dataframe.AggregationType{dataframe.Aggregation_MAX, dataframe.Aggregation_MIN},
	[]string{"values", "values2"},
)
排序
sorted := df.Arrange(
dataframe.Sort("A"), // 升序排列
dataframe.RevSort("B"), // 降序排列
)
添加/修改列
// 修改列C
mut := df.Mutate(
series.New([]string{"a", "b", "c", "d"}, series.String, "C"),
)
// 添加新列E
mut2 := df.Mutate(
series.New([]string{"a", "b", "c", "d"}, series.String, "E"),
)
连接操作
df := dataframe.LoadRecords(
[][]string{
[]string{"A", "B", "C", "D"},
[]string{"a", "4", "5.1", "true"},
[]string{"k", "5", "7.0", "true"},
[]string{"k", "4", "6.0", "true"},
[]string{"a", "2", "7.1", "false"},
},
)
df2 := dataframe.LoadRecords(
[][]string{
[]string{"A", "F", "D"},
[]string{"1", "1", "true"},
[]string{"4", "2", "false"},
[]string{"2", "8", "false"},
[]string{"5", "9", "false"},
},
)
join := df.InnerJoin(df2, "D") // 内连接
链式操作
Gota支持链式操作,可以连续调用多个方法:
// Each method returns a new DataFrame, so the calls can be chained.
// Select takes a single index argument, so the column names go in one slice.
a = a.Rename("Origin", "Country").
	Filter(dataframe.F{Colname: "Age", Comparator: series.Less, Comparando: 50}).
	Filter(dataframe.F{Colname: "Origin", Comparator: series.Eq, Comparando: "United States"}).
	Select([]string{"Id", "Origin", "Date"}).
	Subset([]int{1, 3})

// The chain has no per-call error return; check the Err field once at the end.
if a.Err != nil {
	log.Fatal("Oh noes!")
}
Series类型
Series是DataFrame的构建块,支持以下类型:
- Int
- Float
- String
- Bool
许可证
Gota使用Apache License 2.0许可证。
更多关于golang数据框处理和数据清洗插件库gota的使用的实战教程也可以访问 https://www.itying.com/category-94-b0.html
1 回复
更多关于golang数据框处理和数据清洗插件库gota的使用的实战系列教程也可以访问 https://www.itying.com/category-94-b0.html
Gota:Go语言中的数据框处理与数据清洗库
Gota 是一个用 Go 语言实现的数据框(DataFrame)处理库,类似于 Python 中的 Pandas 或 R 中的 data.frame。它提供了高效的数据操作和清洗功能,特别适合处理结构化数据。
安装 Gota
go get github.com/go-gota/gota/dataframe
go get github.com/go-gota/gota/series
基本用法
1. 创建 DataFrame
package main
import (
	"fmt"
	"log"
	"math/rand"
	"os"

	"github.com/go-gota/gota/dataframe"
	"github.com/go-gota/gota/series"
)
func main() {
// 从 map 创建 DataFrame
df := dataframe.New(
series.New([]string{"Alice", "Bob", "Charlie"}, series.String, "Name"),
series.New([]int{25, 30, 35}, series.Int, "Age"),
series.New([]float64{5.5, 6.1, 5.9}, series.Float, "Height"),
series.New([]bool{false, true, false}, series.Bool, "Married"),
)
fmt.Println(df)
}
2. 从 CSV 加载数据
// loadCSV reads data.csv from the working directory into a DataFrame and
// prints it. It terminates the program if the file cannot be opened or if
// the DataFrame reports a load error.
func loadCSV() {
	file, err := os.Open("data.csv")
	if err != nil {
		log.Fatal(err)
	}
	defer file.Close()

	df := dataframe.ReadCSV(file)
	// ReadCSV has no error return; failures are carried in the Err field.
	// log.Fatal skips the deferred Close, which is acceptable because the
	// process is exiting.
	if df.Err != nil {
		log.Fatal(df.Err)
	}
	fmt.Println(df)
}
3. 数据筛选
// filterData loads three people and prints two filtered views of them.
func filterData() {
	people := []map[string]interface{}{
		{"Name": "Alice", "Age": 25, "City": "NY"},
		{"Name": "Bob", "Age": 30, "City": "LA"},
		{"Name": "Charlie", "Age": 35, "City": "SF"},
	}
	df := dataframe.LoadMaps(people)

	// Keep only the records whose Age is greater than 30.
	olderThan30 := dataframe.F{Colname: "Age", Comparator: series.Greater, Comparando: 30}
	filtered := df.Filter(olderThan30)
	fmt.Println(filtered)

	// Multiple conditions combined with AND: Age > 25 and City == "SF".
	olderThan25 := dataframe.F{Colname: "Age", Comparator: series.Greater, Comparando: 25}
	livesInSF := dataframe.F{Colname: "City", Comparator: series.Eq, Comparando: "SF"}
	filtered2 := df.FilterAggregation(dataframe.And, olderThan25, livesInSF)
	fmt.Println(filtered2)
}
4. 数据排序
// sortData loads three scored records and prints them ordered by Score,
// highest first.
func sortData() {
	df := dataframe.LoadMaps([]map[string]interface{}{
		{"Name": "Alice", "Score": 85},
		{"Name": "Bob", "Score": 92},
		{"Name": "Charlie", "Score": 78},
	})

	// dataframe.Sort accepts only the column name and sorts ascending;
	// descending order is requested with dataframe.RevSort.
	sorted := df.Arrange(dataframe.RevSort("Score"))
	fmt.Println(sorted)
}
5. 数据分组与聚合
// groupData loads four sales records and prints the per-city sum of Sales.
func groupData() {
	sales := []map[string]interface{}{
		{"City": "NY", "Product": "A", "Sales": 100},
		{"City": "NY", "Product": "B", "Sales": 150},
		{"City": "LA", "Product": "A", "Sales": 200},
		{"City": "LA", "Product": "B", "Sales": 250},
	}
	df := dataframe.LoadMaps(sales)

	// Group by city, then sum the Sales column within each group.
	byCity := df.GroupBy("City")
	aggTypes := []dataframe.AggregationType{dataframe.Aggregation_SUM}
	aggCols := []string{"Sales"}
	grouped := byCity.Aggregation(aggTypes, aggCols)
	fmt.Println(grouped)
}
6. 处理缺失值
// handleMissing loads records containing nil values and prints two cleaned
// views: one with incomplete rows removed, one with column A's gaps set to 0.
func handleMissing() {
	df := dataframe.LoadMaps([]map[string]interface{}{
		{"A": 1, "B": 4.5, "C": "x"},
		{"A": nil, "B": nil, "C": "y"},
		{"A": 3, "B": 6.7, "C": nil},
	})

	// Drop the rows that contain a missing value. DataFrame has no DropNA
	// method, so select the complete rows explicitly.
	cleaned := df.Subset(completeRows(df))
	fmt.Println("After dropping NA:", cleaned)

	// Fill missing values: copy column A, overwrite its NaN elements with 0
	// in place, then write the column back under the same name.
	colA := df.Col("A").Copy()
	for i, missing := range colA.IsNaN() {
		if missing {
			colA.Elem(i).Set(0)
		}
	}
	filled := df.Mutate(colA)
	fmt.Println("After filling NA:", filled)
}

// completeRows returns the positions of the rows of df that have no NaN
// element in any column.
func completeRows(df dataframe.DataFrame) []int {
	nrow := df.Nrow()
	hasNaN := make([]bool, nrow)
	for _, name := range df.Names() {
		for i, missing := range df.Col(name).IsNaN() {
			if missing {
				hasNaN[i] = true
			}
		}
	}

	keep := make([]int, 0, nrow)
	for i, incomplete := range hasNaN {
		if !incomplete {
			keep = append(keep, i)
		}
	}
	return keep
}
7. 数据转换
// transformData loads temperature readings tagged with their scale, adds a
// Temp_K column holding each reading converted to kelvin, and prints the result.
func transformData() {
	df := dataframe.LoadMaps([]map[string]interface{}{
		{"Temperature": 32.0, "Scale": "F"},
		{"Temperature": 0.0, "Scale": "C"},
		{"Temperature": 100.0, "Scale": "C"},
	})

	// The conversion needs two columns per row. A Series.Map callback only
	// receives the element (series.Element exposes no row index), so walk
	// both columns by position instead.
	temps := df.Col("Temperature").Float()
	scales := df.Col("Scale").Records()
	kelvin := make([]float64, len(temps))
	for i, temp := range temps {
		kelvin[i] = toKelvin(temp, scales[i])
	}

	// Add the new column.
	df = df.Mutate(series.New(kelvin, series.Float, "Temp_K"))
	fmt.Println(df)
}

// toKelvin converts temp from the given scale ("F" or "C") to kelvin.
// A value with any other scale is returned unchanged.
func toKelvin(temp float64, scale string) float64 {
	switch scale {
	case "F":
		// Fahrenheit to kelvin: (F - 32) × 5/9 + 273.15
		return (temp-32)*5/9 + 273.15
	case "C":
		// Celsius to kelvin: C + 273.15
		return temp + 273.15
	default:
		return temp
	}
}
高级功能
1. 数据连接
// joinData joins a names table with an ages table on their shared ID column
// and prints both the inner and the left join.
func joinData() {
	nameRows := []map[string]interface{}{
		{"ID": 1, "Name": "Alice"},
		{"ID": 2, "Name": "Bob"},
	}
	ageRows := []map[string]interface{}{
		{"ID": 1, "Age": 25},
		{"ID": 2, "Age": 30},
		{"ID": 3, "Age": 35},
	}
	df1 := dataframe.LoadMaps(nameRows)
	df2 := dataframe.LoadMaps(ageRows)

	// Inner join.
	innerJoin := df1.InnerJoin(df2, "ID")
	fmt.Println("Inner Join:", innerJoin)

	// Left join.
	leftJoin := df1.LeftJoin(df2, "ID")
	fmt.Println("Left Join:", leftJoin)
}
2. 描述性统计
// describeData loads a single numeric column and prints its descriptive
// statistics.
func describeData() {
	values := []map[string]interface{}{
		{"Value": 10},
		{"Value": 20},
		{"Value": 30},
		{"Value": 40},
		{"Value": 50},
	}
	df := dataframe.LoadMaps(values)

	// Describe returns the summary as another DataFrame.
	description := df.Describe()
	fmt.Println(description)
}
3. 数据采样
// sampleData loads ten rows and prints a random sample of three of them.
func sampleData() {
	df := dataframe.LoadMaps([]map[string]interface{}{
		{"A": 1}, {"A": 2}, {"A": 3}, {"A": 4}, {"A": 5},
		{"A": 6}, {"A": 7}, {"A": 8}, {"A": 9}, {"A": 10},
	})

	// DataFrame has no Sample method. Shuffle the row positions and keep the
	// first sampleSize of them, which samples rows without replacement.
	const sampleSize = 3
	picks := rand.Perm(df.Nrow())
	if len(picks) > sampleSize {
		picks = picks[:sampleSize]
	}
	sampled := df.Subset(picks)
	fmt.Println(sampled)
}
性能提示
- 对于大型数据集,考虑使用 dataframe.ReadCSV 的 HasHeader 和 WithTypes 选项来优化加载
- 链式操作会创建新的 DataFrame,可能会影响性能,可以合并多个操作为一个
- 对于非常大数据集,可能需要分批处理
Gota 提供了强大的数据操作能力,虽然不如 Python 的 Pandas 功能全面,但在 Go 生态中是最接近的替代方案,特别适合需要在 Go 项目中处理结构化数据的场景。