seaweedfs/weed/mq/schema/to_parquet_schema.go

76 lines
2.5 KiB
Go
Raw Normal View History

2024-04-18 14:49:21 +08:00
package schema
import (
"fmt"
parquet "github.com/parquet-go/parquet-go"
"github.com/seaweedfs/seaweedfs/weed/pb/schema_pb"
)
func ToParquetSchema(topicName string, recordType *schema_pb.RecordType) (*parquet.Schema, error) {
rootNode, err := toParquetFieldTypeRecord(recordType)
if err != nil {
return nil, fmt.Errorf("failed to convert record type to parquet schema: %v", err)
}
2024-04-22 15:42:18 +08:00
// Fields are sorted by name, so the value should be sorted also
// the sorting is inside parquet.`func (g Group) Fields() []Field`
2024-04-18 14:49:21 +08:00
return parquet.NewSchema(topicName, rootNode), nil
}
2024-04-19 13:41:12 +08:00
func toParquetFieldType(fieldType *schema_pb.Type) (dataType parquet.Node, err error) {
switch fieldType.Kind.(type) {
2024-04-18 14:49:21 +08:00
case *schema_pb.Type_ScalarType:
2024-04-19 13:41:12 +08:00
dataType, err = toParquetFieldTypeScalar(fieldType.GetScalarType())
2024-04-25 14:04:47 +08:00
dataType = parquet.Optional(dataType)
2024-04-18 14:49:21 +08:00
case *schema_pb.Type_RecordType:
2024-04-19 13:41:12 +08:00
dataType, err = toParquetFieldTypeRecord(fieldType.GetRecordType())
2024-04-25 14:32:35 +08:00
dataType = parquet.Optional(dataType)
case *schema_pb.Type_ListType:
2024-04-19 13:41:12 +08:00
dataType, err = toParquetFieldTypeList(fieldType.GetListType())
2024-04-18 14:49:21 +08:00
default:
2024-04-19 13:41:12 +08:00
return nil, fmt.Errorf("unknown field type: %T", fieldType.Kind)
2024-04-18 14:49:21 +08:00
}
return dataType, err
}
2024-04-19 13:41:12 +08:00
func toParquetFieldTypeList(listType *schema_pb.ListType) (parquet.Node, error) {
elementType, err := toParquetFieldType(listType.ElementType)
if err != nil {
return nil, err
}
2024-04-25 14:04:47 +08:00
return parquet.Repeated(elementType), nil
2024-04-19 13:41:12 +08:00
}
2024-04-18 14:49:21 +08:00
func toParquetFieldTypeScalar(scalarType schema_pb.ScalarType) (parquet.Node, error) {
switch scalarType {
2024-05-02 23:59:22 +08:00
case schema_pb.ScalarType_BOOL:
2024-04-18 14:49:21 +08:00
return parquet.Leaf(parquet.BooleanType), nil
2024-05-02 23:59:22 +08:00
case schema_pb.ScalarType_INT32:
2024-04-18 14:49:21 +08:00
return parquet.Leaf(parquet.Int32Type), nil
2024-05-02 23:59:22 +08:00
case schema_pb.ScalarType_INT64:
2024-04-18 14:49:21 +08:00
return parquet.Leaf(parquet.Int64Type), nil
2024-05-03 02:14:58 +08:00
case schema_pb.ScalarType_FLOAT:
2024-04-18 14:49:21 +08:00
return parquet.Leaf(parquet.FloatType), nil
2024-05-03 02:14:58 +08:00
case schema_pb.ScalarType_DOUBLE:
2024-04-18 14:49:21 +08:00
return parquet.Leaf(parquet.DoubleType), nil
case schema_pb.ScalarType_BYTES:
return parquet.Leaf(parquet.ByteArrayType), nil
case schema_pb.ScalarType_STRING:
2024-04-22 15:42:18 +08:00
return parquet.Leaf(parquet.ByteArrayType), nil
2024-04-18 14:49:21 +08:00
default:
return nil, fmt.Errorf("unknown scalar type: %v", scalarType)
}
}
func toParquetFieldTypeRecord(recordType *schema_pb.RecordType) (parquet.Node, error) {
recordNode := parquet.Group{}
for _, field := range recordType.Fields {
2024-04-19 13:41:12 +08:00
parquetFieldType, err := toParquetFieldType(field.Type)
2024-04-18 14:49:21 +08:00
if err != nil {
return nil, err
}
recordNode[field.Name] = parquetFieldType
}
return recordNode, nil
}