From 68950af8a6c758816254b6c74cac56b9d3f18cdc Mon Sep 17 00:00:00 2001 From: hnwyllmm Date: Tue, 19 May 2026 20:08:58 +0800 Subject: [PATCH 1/6] fixed android apk build --- package/apk/seekdb-apk-build.sh | 5 +- src/observer/embed/CMakeLists.txt | 22 +- src/sql/engine/basic/ob_select_into_op.cpp | 4780 +++++++++-------- .../ob_external_table_access_service.cpp | 1709 +++--- 4 files changed, 3261 insertions(+), 3255 deletions(-) diff --git a/package/apk/seekdb-apk-build.sh b/package/apk/seekdb-apk-build.sh index 1c8896424..49658c524 100644 --- a/package/apk/seekdb-apk-build.sh +++ b/package/apk/seekdb-apk-build.sh @@ -497,8 +497,8 @@ if [[ "$DO_MAKE" == true ]]; then cd "$TOPDIR" echo "[seekdb-apk-build] ./build.sh clean" ./build.sh clean - echo "[seekdb-apk-build] ./build.sh release --android --init" - ./build.sh release --android --init + echo "[seekdb-apk-build] ./build.sh release --android --init -DBUILD_EMBED_MODE=ON" + ./build.sh release --android --init -DBUILD_EMBED_MODE=ON cd "$TOPDIR" _jobs="$(getconf _NPROCESSORS_ONLN 2>/dev/null || sysctl -n hw.ncpu 2>/dev/null || echo 4)" if [[ "$WITH_JNI" == true ]]; then @@ -614,3 +614,4 @@ if [[ "$DO_INSTALL" == true ]]; then fi echo "[package_embedded_apk] done" + diff --git a/src/observer/embed/CMakeLists.txt b/src/observer/embed/CMakeLists.txt index c557a0861..b17dbfa4c 100644 --- a/src/observer/embed/CMakeLists.txt +++ b/src/observer/embed/CMakeLists.txt @@ -35,26 +35,26 @@ if(CMAKE_SYSTEM_NAME STREQUAL "Android") target_link_libraries(embedded_client PRIVATE seekdb_embed_c) endif() -if(BUILD_EMBED_MODE) +if(BUILD_EMBED_MODE AND NOT CMAKE_SYSTEM_NAME STREQUAL "Android") # Set target Python version, can be overridden by CMake parameter if(NOT DEFINED PYTHON_VERSION) set(PYTHON_VERSION "3.8") endif() set(Python3_FOUND FALSE) - + message(STATUS "Target Python version: ${PYTHON_VERSION}") - + # First try to find the specified Python version from system message(STATUS "Searching for Python ${PYTHON_VERSION} in system 
first...") find_package(Python3 ${PYTHON_VERSION} EXACT COMPONENTS Interpreter Development.Module QUIET) if(Python3_FOUND) message(STATUS "Found system Python ${PYTHON_VERSION}, using it for embed module") endif() - + # If system doesn't have the specified Python version, try to find from pyenv if(NOT Python3_FOUND) message(STATUS "No system Python ${PYTHON_VERSION} found, trying pyenv...") - + # Method 1: Check if pyenv version can be specified via environment variable if(DEFINED ENV{PYENV_VERSION}) execute_process( @@ -79,7 +79,7 @@ if(BUILD_EMBED_MODE) endif() endif() endif() - + # Method 2: Directly try to find the specified version in pyenv if(NOT Python3_FOUND) execute_process( @@ -119,7 +119,7 @@ if(BUILD_EMBED_MODE) endif() endif() endif() - + # Method 3: Directly find the specified version through PYENV_ROOT if(NOT Python3_FOUND AND DEFINED ENV{PYENV_ROOT}) set(PYENV_SPECIFIC_PYTHON "$ENV{PYENV_ROOT}/versions/${PYTHON_VERSION}/bin/python") @@ -130,12 +130,12 @@ if(BUILD_EMBED_MODE) endif() endif() endif() - + # If still no suitable Python version found, error and exit if(NOT Python3_FOUND) message(FATAL_ERROR "No suitable Python ${PYTHON_VERSION} found. 
Please install Python ${PYTHON_VERSION} or configure pyenv with Python ${PYTHON_VERSION}.") endif() - + # Verify the final Python choice if(Python3_EXECUTABLE) execute_process( @@ -145,7 +145,7 @@ if(BUILD_EMBED_MODE) ) message(STATUS "Final Python choice: ${Python3_EXECUTABLE} (${PYTHON_VERSION_OUTPUT})") endif() - + message(STATUS "Building embed module with Python ${Python3_EXECUTABLE}") execute_process( COMMAND "${Python3_EXECUTABLE}" -c @@ -194,7 +194,7 @@ if(BUILD_EMBED_MODE) SUFFIX ".so" OUTPUT_NAME "${libname}" ) - + target_link_libraries(${libname} PUBLIC oceanbase_static $<$>:-static-libgcc> diff --git a/src/sql/engine/basic/ob_select_into_op.cpp b/src/sql/engine/basic/ob_select_into_op.cpp index d11b9877c..293b8cf29 100644 --- a/src/sql/engine/basic/ob_select_into_op.cpp +++ b/src/sql/engine/basic/ob_select_into_op.cpp @@ -14,2395 +14,2399 @@ * limitations under the License. */ -#define USING_LOG_PREFIX SQL_ENG - -#include -#include - -#include "ob_select_into_op.h" -#include "sql/engine/cmd/ob_variable_set_executor.h" -#include "lib/charset/ob_charset_string_helper.h" -#include "sql/engine/px/ob_px_sqc_handler.h" -#include "sql/engine/expr/ob_expr_json_func_helper.h" -#include "lib/udt/ob_collection_type.h" -#include "share/config/ob_server_config.h" - -#ifndef OB_BUILD_EMBED_MODE -#include -#include -#include -#include -#include -#include -#include - -#define ARROW_FAIL(statement) (OB_UNLIKELY(!(statement).ok())) - -#endif - -namespace oceanbase -{ -using namespace common; -namespace sql -{ - -OB_SERIALIZE_MEMBER(ObSelectIntoOpInput, task_id_, sqc_id_); -OB_SERIALIZE_MEMBER((ObSelectIntoSpec, ObOpSpec), into_type_, user_vars_, outfile_name_, - field_str_, // FARM COMPAT WHITELIST FOR filed_str_: renamed - line_str_, closed_cht_, is_optional_, select_exprs_, is_single_, max_file_size_, - escaped_cht_, cs_type_, parallel_, file_partition_expr_, buffer_size_, is_overwrite_, - external_properties_, external_partition_, alias_names_); - - -int 
ObSelectIntoOp::inner_open() -{ - int ret = OB_SUCCESS; - ObSQLSessionInfo *session = NULL; - if (OB_ISNULL(session = ctx_.get_my_session())) { - ret = OB_ERR_UNEXPECTED; - LOG_WARN("get session failed", K(ret)); - } else { - // since we call get_next_row in inner_open, we have to set opened_ first in avoid to a infinite loop. - opened_ = true; - if (OB_FAIL(session->get_sql_select_limit(top_limit_cnt_))) { - LOG_WARN("fail tp get sql select limit", K(ret)); - } - } - if (OB_SUCC(ret) && !MY_SPEC.external_properties_.str_.empty()) { - if (OB_FAIL(external_properties_.load_from_string(MY_SPEC.external_properties_.str_, - ctx_.get_allocator()))) { - LOG_WARN("failed to load external properties", K(ret)); - } else { - format_type_ = external_properties_.format_type_; - } - } - if (OB_SUCC(ret)) { - switch (format_type_) - { - case ObExternalFileFormat::FormatType::CSV_FORMAT: - { - if (OB_FAIL(init_csv_env())) { - LOG_WARN("failed to init csv env", K(ret)); - } - break; - } - case ObExternalFileFormat::FormatType::ODPS_FORMAT: - { - if (!GCONF._use_odps_jni_connector) { - ret = OB_NOT_SUPPORTED; - LOG_WARN("not support odps format", K(ret)); - } else { - ret = OB_NOT_SUPPORTED; - LOG_WARN("not support odps format", K(ret)); - } - break; - } - case ObExternalFileFormat::FormatType::PARQUET_FORMAT: - { -#ifndef OB_BUILD_EMBED_MODE - if (OB_FAIL(init_parquet_env())) { - LOG_WARN("failed to init parquet env", K(ret)); - } -#endif - break; - } - case ObExternalFileFormat::FormatType::ORC_FORMAT: - { - ret = OB_NOT_SUPPORTED; - break; - } - default: - { - ret = OB_NOT_SUPPORTED; - LOG_WARN("not support select into type", K(format_type_)); - } - } - } - return ret; -} - -int ObSelectIntoOp::init_csv_env() -{ - int ret = OB_SUCCESS; - ObSQLSessionInfo *session = NULL; - set_csv_format_options(); - if (OB_ISNULL(session = ctx_.get_my_session())) { - ret = OB_ERR_UNEXPECTED; - LOG_WARN("get session failed", K(ret)); - } else if (OB_FAIL(init_env_common())) { - LOG_WARN("failed 
to init env common", K(ret)); - } else if (OB_FAIL(prepare_escape_printer())) { - LOG_WARN("failed to calc escape info", K(ret)); - } else { - if (external_properties_.csv_format_.compression_algorithm_ != CsvCompressType::NONE) { - has_compress_ = true; - } - // setup binary output format for bit/binary - switch (external_properties_.csv_format_.binary_format_) { - case ObCSVGeneralFormat::ObCSVBinaryFormat::DEFAULT: - print_params_.binary_string_print_hex_ = false; - break; - case ObCSVGeneralFormat::ObCSVBinaryFormat::HEX: - print_params_.binary_string_print_hex_ = true; - break; - case ObCSVGeneralFormat::ObCSVBinaryFormat::BASE64: - print_params_.binary_string_print_base64_ = true; - break; - default: - ret = OB_ERR_UNEXPECTED; - LOG_WARN("failed to set csv binary output format", K(ret)); - } - print_params_.tz_info_ = session->get_timezone_info(); - print_params_.use_memcpy_ = true; - print_params_.cs_type_ = cs_type_; - } - //create buffer - if (OB_SUCC(ret) && T_INTO_OUTFILE == MY_SPEC.into_type_ && OB_FAIL(create_shared_buffer_for_data_writer())) { - LOG_WARN("failed to create buffer for data writer", K(ret)); - } - return ret; -} - -void ObSelectIntoOp::set_csv_format_options() -{ - if (MY_SPEC.external_properties_.str_.empty()) { - field_str_ = MY_SPEC.field_str_; - line_str_ = MY_SPEC.line_str_; - has_enclose_ = MY_SPEC.closed_cht_.get_val_len() > 0; - char_enclose_ = has_enclose_ ? MY_SPEC.closed_cht_.get_char().ptr()[0] : 0; - is_optional_ = MY_SPEC.is_optional_; - has_escape_ = MY_SPEC.escaped_cht_.get_val_len() > 0; - char_escape_ = has_escape_ ? 
MY_SPEC.escaped_cht_.get_char().ptr()[0] : 0; - cs_type_ = MY_SPEC.cs_type_; - } else { - is_optional_ = external_properties_.csv_format_.is_optional_; - cs_type_ = ObCharset::get_default_collation(external_properties_.csv_format_.cs_type_); - field_str_.set_varchar(external_properties_.csv_format_.field_term_str_); - field_str_.set_collation_type(cs_type_); - line_str_.set_varchar(external_properties_.csv_format_.line_term_str_); - line_str_.set_collation_type(cs_type_); - if (external_properties_.csv_format_.field_enclosed_char_ == INT64_MAX) { // null - has_enclose_ = false; - char_enclose_ = 0; - } else { - has_enclose_ = true; - char_enclose_ = external_properties_.csv_format_.field_enclosed_char_; - } - if (external_properties_.csv_format_.field_escaped_char_ == INT64_MAX) { // null - has_escape_ = false; - char_escape_ = 0; - } else { - has_escape_ = true; - char_escape_ = external_properties_.csv_format_.field_escaped_char_; - } - } -} - -int ObSelectIntoOp::init_env_common() -{ - int ret = OB_SUCCESS; - ObPhysicalPlanCtx *phy_plan_ctx = NULL; - bool need_check = false; - file_name_ = MY_SPEC.outfile_name_; - do_partition_ = MY_SPEC.file_partition_expr_ == NULL ? 
false : true; - if (OB_ISNULL(phy_plan_ctx = ctx_.get_physical_plan_ctx())) { - ret = OB_ERR_UNEXPECTED; - LOG_WARN("get phy_plan_ctx failed", K(ret)); - } else if (OB_FAIL(ObSQLUtils::get_param_value(MY_SPEC.outfile_name_, - phy_plan_ctx->get_param_store(), - file_name_, - need_check))) { - LOG_WARN("get param value failed", K(ret)); - } else if (OB_FAIL(calc_url_and_set_access_info())) { - LOG_WARN("failed to calc basic url and set device handle", K(ret)); - } else if (OB_FAIL(check_has_lob_or_json())) { - LOG_WARN("failed to check has lob", K(ret)); - } else if (has_coll_ && MY_SPEC.into_type_ == T_INTO_VARIABLES) { - ret = OB_NOT_SUPPORTED; - LOG_USER_ERROR(OB_NOT_SUPPORTED, "select array/map into variables"); - } else if (do_partition_ - && OB_FAIL(partition_map_.create(128, ObLabel("SelectInto"), ObLabel("SelectInto"), MTL_ID()))) { - LOG_WARN("failed to create hashmap", K(ret)); - } else if (MY_SPEC.select_exprs_.count() != MY_SPEC.alias_names_.strs_.count()) { - ret = OB_ERR_UNEXPECTED; - LOG_WARN("get unexpected column count", K(MY_SPEC.select_exprs_.count()), - K(MY_SPEC.alias_names_.strs_.count()), K(ret)); - } - return ret; -} - -//calc first data_writer.url_ and basic_url_ -int ObSelectIntoOp::calc_url_and_set_access_info() -{ - int ret = OB_SUCCESS; - const ObItemType into_type = MY_SPEC.into_type_; - ObString path = file_name_.get_varchar().trim(); - if (path.prefix_match_ci(OB_S3_PREFIX)) { - ret = OB_NOT_SUPPORTED; - LOG_USER_ERROR(OB_NOT_SUPPORTED, "S3 storage"); - LOG_WARN("S3 storage is not supported", K(ret)); - } else if (path.prefix_match_ci(OB_AZBLOB_PREFIX)) { - file_location_ = IntoFileLocation::REMOTE_AZBLOB; - } else if (path.prefix_match_ci(OB_OSS_PREFIX)) { - ret = OB_NOT_SUPPORTED; - LOG_USER_ERROR(OB_NOT_SUPPORTED, "OSS storage"); - LOG_WARN("OSS storage is not supported", K(ret)); - } else if (path.prefix_match_ci(OB_COS_PREFIX)) { - ret = OB_NOT_SUPPORTED; - LOG_USER_ERROR(OB_NOT_SUPPORTED, "COS storage"); - LOG_WARN("COS storage 
is not supported", K(ret)); - } else if (path.prefix_match_ci(OB_HDFS_PREFIX)) { - ret = OB_NOT_SUPPORTED; - LOG_USER_ERROR(OB_NOT_SUPPORTED, "HDFS storage"); - LOG_WARN("HDFS storage is not supported", K(ret)); - } else { - file_location_ = IntoFileLocation::SERVER_DISK; - } - if (file_location_ == IntoFileLocation::SERVER_DISK && do_partition_) { - ret = OB_NOT_SUPPORTED; - LOG_WARN("not support partition option on server disk", K(ret)); - LOG_USER_ERROR(OB_NOT_SUPPORTED, "partition option on server disk"); - } else if (T_INTO_OUTFILE == into_type && !MY_SPEC.is_single_ && OB_FAIL(calc_first_file_path(path))) { - LOG_WARN("failed to calc first file path", K(ret)); - } else if (file_location_ != IntoFileLocation::SERVER_DISK) { - ObString temp_url = path.split_on('?'); - temp_url.trim(); - ObString storage_info; - if (OB_FAIL(ob_write_string(ctx_.get_allocator(), temp_url, basic_url_, true))) { - LOG_WARN("failed to append string", K(ret)); - } else if (OB_FAIL(ob_write_string(ctx_.get_allocator(), path, storage_info, true))) { - LOG_WARN("failed to append string", K(ret)); - } else if (OB_FAIL(access_info_.set(basic_url_.ptr(), storage_info.ptr()))) { - LOG_WARN("failed to set access info", K(ret), K(path)); - } else if (basic_url_.empty() || !access_info_.is_valid()) { - ret = OB_FILE_NOT_EXIST; - LOG_WARN("file path not exist", K(ret), K(basic_url_), K(access_info_)); - } - } else { // IntoFileLocation::SERVER_DISK - if (OB_FAIL(ob_write_string(ctx_.get_allocator(), path, basic_url_, true))) { - LOG_WARN("failed to write string", K(ret)); - } - } - if (OB_SUCC(ret) && (T_INTO_OUTFILE == into_type || T_INTO_DUMPFILE == into_type) - && IntoFileLocation::SERVER_DISK == file_location_ && OB_FAIL(check_secure_file_path(basic_url_))) { - LOG_WARN("failed to check secure file path", K(ret)); - } - return ret; -} -// csv, odps supports batch and non-batch interfaces; parquet, orc only supports batch interface; non-batch interface will be discontinued later -int 
ObSelectIntoOp::inner_get_next_row() -{ - int ret = 0 == top_limit_cnt_ ? OB_ITER_END : OB_SUCCESS; - int64_t row_count = 0; - const ObItemType into_type = MY_SPEC.into_type_; - ObPhysicalPlanCtx *phy_plan_ctx = NULL; - ObExternalFileWriter *data_writer = NULL; - if (ObExternalFileFormat::FormatType::CSV_FORMAT != format_type_ - && ObExternalFileFormat::FormatType::ODPS_FORMAT != format_type_) { - ret = OB_NOT_SUPPORTED; - LOG_WARN("this type not supported in not batch interface", K(ret), K(format_type_)); - LOG_USER_ERROR(OB_NOT_SUPPORTED, "this upload type"); - } else if (OB_ISNULL(phy_plan_ctx = ctx_.get_physical_plan_ctx())) { - ret = OB_ERR_UNEXPECTED; - LOG_WARN("get phy_plan_ctx failed", K(ret)); - } - //when do_partition is false, create the only data_writer here - if (OB_SUCC(ret) && ObExternalFileFormat::FormatType::CSV_FORMAT == format_type_ - && T_INTO_VARIABLES != into_type && !do_partition_ - && OB_FAIL(create_the_only_data_writer(data_writer))) { - LOG_WARN("failed to create the only data writer", K(ret)); - } - while (OB_SUCC(ret) && row_count < top_limit_cnt_) { - clear_evaluated_flag(); - if (OB_FAIL(child_->get_next_row())) { - if (OB_LIKELY(OB_ITER_END == ret)) { - } else { - LOG_WARN("get next row failed", K(ret)); - } - } else { - ++row_count; - if (ObExternalFileFormat::FormatType::ODPS_FORMAT == format_type_) { - if (is_odps_cpp_table_ == is_odps_java_table_) { - ret = OB_ERR_UNEXPECTED; - LOG_WARN("invalid table mode for odps table", K(ret), - K(is_odps_cpp_table_), K(is_odps_java_table_)); - } else if (is_odps_cpp_table_) { - ret = OB_NOT_SUPPORTED; - LOG_USER_ERROR(OB_NOT_SUPPORTED, "external odps cpp table"); - LOG_WARN("use supported version", K(ret)); - } else { - ret = OB_NOT_SUPPORTED; - LOG_USER_ERROR(OB_NOT_SUPPORTED, "external odps table"); - LOG_WARN("not support jni odps single write", K(ret)); - } - } else if (T_INTO_VARIABLES == into_type) { - if (OB_FAIL(into_varlist())) { - LOG_WARN("into varlist failed", K(ret)); - } - } 
else if (T_INTO_OUTFILE == into_type) { - if (OB_FAIL(into_outfile(data_writer))) { - LOG_WARN("into outfile failed", K(ret)); - } - } else { - if (OB_FAIL(into_dumpfile(data_writer))) { - LOG_WARN("into dumpfile failed", K(ret)); - } - } - } - if (OB_SUCC(ret) || OB_ITER_END == ret) { // if into user variables or into dumpfile, must be one row - if (ObExternalFileFormat::FormatType::CSV_FORMAT == format_type_ - && (T_INTO_VARIABLES == into_type || T_INTO_DUMPFILE == into_type) && row_count > 1) { - ret = OB_ERR_TOO_MANY_ROWS; - LOG_WARN("more than one row for into variables or into dumpfile", K(ret), K(row_count)); - } - } - } //end while - if (OB_ITER_END == ret || OB_SUCC(ret)) { // set affected rows - phy_plan_ctx->set_affected_rows(row_count); - } - if (OB_FAIL(ret) && OB_ITER_END != ret) { - need_commit_ = false; - } - return ret; -} - -int ObSelectIntoOp::inner_get_next_batch(const int64_t max_row_cnt) -{ - int ret = OB_SUCCESS; - const ObBatchRows *child_brs = NULL; - int64_t batch_size = min(max_row_cnt, MY_SPEC.max_batch_size_); - int64_t row_count = 0; - const ObItemType into_type = MY_SPEC.into_type_; - ObPhysicalPlanCtx *phy_plan_ctx = NULL; - ObExternalFileWriter *data_writer = NULL; - bool stop_loop = false; - bool is_iter_end = false; - if (OB_ISNULL(phy_plan_ctx = ctx_.get_physical_plan_ctx())) { - ret = OB_ERR_UNEXPECTED; - LOG_WARN("get phy_plan_ctx failed", K(ret)); - } - //when do_partition is false, create the only data_writer here - if (OB_SUCC(ret) && T_INTO_VARIABLES != into_type && !do_partition_ - && (ObExternalFileFormat::FormatType::CSV_FORMAT == format_type_ - || ObExternalFileFormat::FormatType::PARQUET_FORMAT == format_type_ - )) { - if (OB_FAIL(create_the_only_data_writer(data_writer))) { - LOG_WARN("failed to create the only data writer", K(ret)); - } else if (OB_ISNULL(data_writer)) { - ret = OB_ERR_UNEXPECTED; - LOG_WARN("get unexpected null", K(ret)); - } - } - - if (0 == top_limit_cnt_) { - brs_.size_ = 0; - brs_.end_ = true; - 
stop_loop = true; - } - while (OB_SUCC(ret) && !stop_loop) { - clear_evaluated_flag(); - int64_t rowkey_batch_size = min(batch_size, top_limit_cnt_ - row_count); - if (OB_FAIL(child_->get_next_batch(rowkey_batch_size, child_brs))) { - LOG_WARN("get next batch failed", K(ret)); - } else { - brs_.size_ = child_brs->size_; - brs_.end_ = child_brs->end_; - is_iter_end = brs_.end_ && 0 == brs_.size_; - if (brs_.size_ > 0) { - brs_.skip_->deep_copy(*(child_brs->skip_), brs_.size_); - row_count += brs_.size_ - brs_.skip_->accumulate_bit_cnt(brs_.size_); - if (ObExternalFileFormat::FormatType::ODPS_FORMAT == format_type_) { - if (!GCONF._use_odps_jni_connector) { - ret = OB_NOT_SUPPORTED; - LOG_WARN("odps cpp connector is not supported", K(ret)); - } else { - ret = OB_NOT_SUPPORTED; - LOG_WARN("odps jni connector is not supported", K(ret)); - } - } else if (T_INTO_OUTFILE == into_type) { - if (ObExternalFileFormat::FormatType::CSV_FORMAT == format_type_) { - if (OB_FAIL(into_outfile_batch_csv(brs_, data_writer))) { - LOG_WARN("csv into outfile batch failed", K(ret)); - } - } else if (ObExternalFileFormat::FormatType::PARQUET_FORMAT == format_type_) { -#ifndef OB_BUILD_EMBED_MODE - if (OB_FAIL(into_outfile_batch_parquet(brs_, data_writer))) { - LOG_WARN("parquet into outfile batch failed", K(ret)); - } -#else - ret = OB_NOT_SUPPORTED; - LOG_WARN("parquet is not supported in embed mode", K(ret)); -#endif // OB_BUILD_EMBED_MODE - } else if (ObExternalFileFormat::FormatType::ORC_FORMAT == format_type_) { - ret = OB_NOT_SUPPORTED; - } else { - ret = OB_NOT_SUPPORTED; - LOG_WARN("not support to write into outfile format.", K(ret), K(format_type_)); - } - } else { - ObEvalCtx::BatchInfoScopeGuard guard(eval_ctx_); - guard.set_batch_size(brs_.size_); - for (int64_t i = 0; OB_SUCC(ret) && i < brs_.size_; i++) { - if (brs_.skip_->contain(i)) { - continue; - } - guard.set_batch_idx(i); - if (T_INTO_VARIABLES == into_type) { - if (OB_FAIL(into_varlist())) { - LOG_WARN("into varlist 
failed", K(ret)); - } - } else { - if (OB_FAIL(into_dumpfile(data_writer))) { - LOG_WARN("into dumpfile failed", K(ret)); - } - } - } - } - } - } - if (is_iter_end || row_count >= top_limit_cnt_) { - stop_loop = true; - } - if (OB_SUCC(ret) || is_iter_end) { // if into user variables or into dumpfile, must be one row - if ((T_INTO_VARIABLES == into_type || T_INTO_DUMPFILE == into_type) && row_count > 1) { - ret = OB_ERR_TOO_MANY_ROWS; - LOG_WARN("more than one row for into variables or into dumpfile", K(ret), K(row_count)); - } - } - } //end while - if (OB_SUCC(ret)) { // set affected rows - phy_plan_ctx->set_affected_rows(row_count); - } - if (OB_FAIL(ret)) { - need_commit_ = false; - } - return ret; -} - -int ObSelectIntoOp::inner_rescan() -{ - int ret = OB_SUCCESS; - return ret; -} - -int ObSelectIntoOp::inner_close() -{ - int ret = OB_SUCCESS; - ObExternalFileWriter *data_writer = NULL; - int64_t estimated_bytes = 0; - if (ObExternalFileFormat::FormatType::ODPS_FORMAT == format_type_) { - if (!GCONF._use_odps_jni_connector) { - ret = OB_NOT_SUPPORTED; - LOG_WARN("odps jni connector is not supported", K(ret)); - } else { - ret = OB_NOT_SUPPORTED; - LOG_WARN("odps jni connector is not supported", K(ret)); - } - } else if (do_partition_) { - for (ObPartitionWriterMap::iterator iter = partition_map_.begin(); - OB_SUCC(ret) && iter != partition_map_.end(); iter++) { - if (OB_ISNULL(data_writer = iter->second)) { - ret = OB_ERR_UNEXPECTED; - LOG_WARN("data writer is unexpected null", K(ret)); - } else if (OB_FAIL(data_writer->close_data_writer())) { - LOG_WARN("failed to close data writer", K(ret)); - } - } - } else if (OB_NOT_NULL(data_writer_) && OB_FAIL(data_writer_->close_data_writer())) { - LOG_WARN("failed to close data writer", K(ret)); - } - return ret; -} - -int ObSelectIntoOp::get_row_str(const int64_t buf_len, - bool is_first_row, - char *buf, - int64_t &pos) -{ - int ret = OB_SUCCESS; - const ObObj &field_str = field_str_; - char closed_cht = 
char_enclose_; - //before 4_1 use output - //after 4_1 use select exprs - const ObIArray &select_exprs = (MY_SPEC.select_exprs_.empty()) ? - MY_SPEC.output_ : MY_SPEC.select_exprs_; - if (!is_first_row && line_str_.is_varying_len_char_type()) { // lines terminated by "a" - ret = databuff_printf(buf, buf_len, pos, "%.*s", line_str_.get_varchar().length(), - line_str_.get_varchar().ptr()); - } - - for (int i = 0 ; OB_SUCC(ret) && i < select_exprs.count() ; i++) { - const ObExpr *expr = select_exprs.at(i); - if (0 != closed_cht && (!is_optional_ || ob_is_string_type(expr->datum_meta_.type_))) { - // closed by "a" (for all cell) or optionally by "a" (for string cell) - if (OB_FAIL(databuff_printf(buf, buf_len, pos, "%c", closed_cht))) { - LOG_WARN("print closed character failed", K(ret), K(closed_cht)); - } - } - if (OB_SUCC(ret)) { - ObObj cell; - ObDatum *datum = NULL; - if (OB_FAIL(expr->eval(eval_ctx_, datum))) { - LOG_WARN("expr eval failed", K(ret)); - } else if (OB_FAIL(datum->to_obj(cell, expr->obj_meta_))) { - LOG_WARN("to obj failed", K(ret)); - } else if (OB_FAIL(cell.print_plain_str_literal(buf, buf_len, pos))) { // cell value - LOG_WARN("print sql failed", K(ret), K(cell)); - } else if (0 != closed_cht && (!is_optional_ || ob_is_string_type(expr->datum_meta_.type_))) { - if (OB_FAIL(databuff_printf(buf, buf_len, pos, "%c", closed_cht))) { - LOG_WARN("print closed character failed", K(ret), K(closed_cht)); - } - } - // field terminated by "a" - if (OB_SUCC(ret) && i != select_exprs.count() - 1 && field_str.is_varying_len_char_type()) { - if (OB_FAIL(databuff_printf(buf, buf_len, pos, "%.*s", field_str.get_varchar().length(), field_str.get_varchar().ptr()))) { - LOG_WARN("print field str failed", K(ret), K(field_str)); - } - } - } - } - - return ret; -} - -int ObSelectIntoOp::calc_first_file_path(ObString &path) -{ - int ret = OB_SUCCESS; - ObSqlString file_name_with_suffix; - ObString file_extension; - ObSelectIntoOpInput *input = static_cast(input_); - 
ObString input_file_name = file_location_ != IntoFileLocation::SERVER_DISK - ? path.split_on('?').trim() - : path; - if (OB_ISNULL(input)) { - ret = OB_ERR_UNEXPECTED; - LOG_WARN("op input is null", K(ret)); - } else if (input_file_name.length() == 0 || path.length() == 0) { - ret = OB_INVALID_ARGUMENT; - LOG_USER_ERROR(OB_INVALID_ARGUMENT, "invalid outfile path"); - LOG_WARN("invalid outfile path", K(ret)); - } else { - if (input_file_name.ptr()[input_file_name.length() - 1] == '/'){ - OZ(file_name_with_suffix.append_fmt("%.*sdata", input_file_name.length(), input_file_name.ptr())); - } else { - OZ(file_name_with_suffix.append_fmt("%.*s", input_file_name.length(), input_file_name.ptr())); - } - if (MY_SPEC.parallel_ > 1) { - OZ(file_name_with_suffix.append_fmt("_%ld_%ld_%d", input->sqc_id_, input->task_id_, 0)); - } else { - OZ(file_name_with_suffix.append_fmt("_%d", 0)); - } - OZ(external_properties_.get_format_file_extension(format_type_, file_extension)); - if (!file_extension.empty() && file_extension.ptr()[0] != '.') { - OZ(file_name_with_suffix.append(".")); - } - OZ(file_name_with_suffix.append(file_extension)); - if (format_type_ == ObExternalFileFormat::FormatType::CSV_FORMAT) { - OZ(file_name_with_suffix.append(compression_algorithm_to_suffix(external_properties_.csv_format_.compression_algorithm_))); - } - if (file_location_ != IntoFileLocation::SERVER_DISK) { - OZ(file_name_with_suffix.append_fmt("?%.*s", path.length(), path.ptr())); - } - if (OB_SUCC(ret) && OB_FAIL(ob_write_string(ctx_.get_allocator(), file_name_with_suffix.string(), path))) { - LOG_WARN("failed to write string", K(ret)); - } - } - return ret; -} - -int ObSelectIntoOp::calc_next_file_path(ObExternalFileWriter &data_writer) -{ - int ret = OB_SUCCESS; - ObSqlString url_with_suffix; - ObString file_path; - data_writer.split_file_id_++; - if (data_writer.split_file_id_ > 0) { - if (MY_SPEC.is_single_ && IntoFileLocation::SERVER_DISK != file_location_) { - file_path = 
(data_writer.split_file_id_ > 1) - ? data_writer.url_.split_on(data_writer.url_.reverse_find('.')) - : data_writer.url_; - if (OB_FAIL(url_with_suffix.assign(file_path))) { - LOG_WARN("failed to assign string", K(ret)); - } else if (OB_FAIL(url_with_suffix.append_fmt(".extend%ld", data_writer.split_file_id_))) { - LOG_WARN("failed to append string", K(ret)); - } - } else if (!MY_SPEC.is_single_) { - file_path = data_writer.url_.split_on(data_writer.url_.reverse_find('_')); - if (OB_FAIL(url_with_suffix.assign(file_path))) { - LOG_WARN("failed to assign string", K(ret)); - } else if (OB_FAIL(url_with_suffix.append_fmt("_%ld", data_writer.split_file_id_))) { - LOG_WARN("failed to append string", K(ret)); - } - } else { - ret = OB_ERR_UNEXPECTED; - LOG_WARN("get unexpected single value", K(ret)); - } - if (!MY_SPEC.is_single_) { - ObString file_extension; - OZ(external_properties_.get_format_file_extension(format_type_, file_extension)); - if (!file_extension.empty() && file_extension.ptr()[0] != '.') { - OZ(url_with_suffix.append(".")); - } - OZ(url_with_suffix.append(file_extension)); - } - if (!MY_SPEC.is_single_ - && format_type_ == ObExternalFileFormat::FormatType::CSV_FORMAT) { - OZ(url_with_suffix.append(compression_algorithm_to_suffix(external_properties_.csv_format_.compression_algorithm_))); - } - if (OB_SUCC(ret) && OB_FAIL(ob_write_string(ctx_.get_allocator(), - url_with_suffix.string(), - data_writer.url_, true))) { - LOG_WARN("failed to write string", K(ret)); - } - } else { - ret = OB_ERR_UNEXPECTED; - LOG_WARN("get unexpected split file id", K(ret)); - } - return ret; -} -// Set the current data_writer's url_ based on the incoming partition and basic_url_, each partition only needs to be calculated once, subsequent changes only need to modify the split id -int ObSelectIntoOp::calc_file_path_with_partition(ObString partition, ObExternalFileWriter &data_writer) -{ - int ret = OB_SUCCESS; - ObSqlString url_with_partition; - ObString dir_path; - if 
(OB_FAIL(ob_write_string(ctx_.get_allocator(), basic_url_, data_writer.url_))) { - LOG_WARN("failed to write string", K(ret)); - } else { - dir_path = data_writer.url_.split_on(data_writer.url_.reverse_find('/')); - if (OB_FAIL(url_with_partition.assign(dir_path))) { - LOG_WARN("failed to assign string", K(ret)); - } else if (url_with_partition.length() != 0 && OB_FAIL(url_with_partition.append("/"))) { - LOG_WARN("failed to append string", K(ret)); - } else if (partition.length() != 0 && OB_FAIL(url_with_partition.append_fmt("%.*s/", - partition.length(), - partition.ptr()))) { - LOG_WARN("failed to append string", K(ret)); - } else if (partition.length() == 0 && OB_FAIL(url_with_partition.append("__NULL__/"))) { - LOG_WARN("failed to append string", K(ret)); - } else if (OB_FAIL(url_with_partition.append_fmt("%.*s", - data_writer.url_.length(), - data_writer.url_.ptr()))) { - LOG_WARN("failed to append string", K(ret)); - } else if (OB_FAIL(ob_write_string(ctx_.get_allocator(), - url_with_partition.string(), - data_writer.url_, - true))) { - LOG_WARN("failed to write string", K(ret)); - } - } - return ret; -} - -int ObSelectIntoOp::split_file(ObExternalFileWriter &data_writer) -{ - int ret = OB_SUCCESS; - if (ObExternalFileFormat::FormatType::CSV_FORMAT == format_type_) { - ObCsvFileWriter *csv_data_writer = static_cast(&data_writer); - if (OB_ISNULL(csv_data_writer)) { - ret = OB_ERR_UNEXPECTED; - LOG_WARN("get unexpected null data writer", K(ret)); - } else if (!use_shared_buf_ && OB_FAIL(csv_data_writer->flush_buf())) { - LOG_WARN("failed to flush buffer", K(ret)); - } else if (has_lob_ && use_shared_buf_ && OB_FAIL(csv_data_writer->flush_shared_buf(shared_buf_))) { - // To ensure the integrity of each line in the file, when there is a lob, the shared buffer may not contain a complete line - // Therefore the remaining content in the shared buffer also needs to be flushed to the current file, in this case, the max_file_size limit cannot be strictly enforced - 
LOG_WARN("failed to flush shared buffer", K(ret)); - } - } - if (OB_FAIL(ret)) { - } else if (OB_FAIL(data_writer.close_file())) { - LOG_WARN("failed to close file", K(ret)); - } else if (OB_FAIL(calc_next_file_path(data_writer))) { - LOG_WARN("failed to calculate new file path", K(ret)); - } - return ret; -} - -int ObSelectIntoOp::check_csv_file_size(ObCsvFileWriter &data_writer) -{ - int ret = OB_SUCCESS; - int64_t curr_bytes = data_writer.get_file_size(); - int64_t curr_bytes_exclude_curr_line = data_writer.get_curr_bytes_exclude_curr_line(); - int64_t curr_line_len = curr_bytes - curr_bytes_exclude_curr_line; - bool has_split = false; - bool has_use_shared_buf = use_shared_buf_; - if (has_compress_ && OB_ISNULL(data_writer.get_compress_stream_writer())) { - ret = OB_ERR_UNEXPECTED; - LOG_WARN("get unexpected null compress stream writer", K(ret)); - } else if (!(has_lob_ && has_use_shared_buf) && curr_bytes_exclude_curr_line == 0) { - } else if (file_need_split(curr_bytes)) { - if (OB_FAIL(split_file(data_writer))) { - LOG_WARN("failed to split file", K(ret)); - } else { - has_split = true; - } - } - if (OB_SUCC(ret)) { - if (has_lob_ && has_use_shared_buf) { - if (!has_compress_) { - data_writer.set_write_bytes(has_split ? 0 : curr_bytes); - } - data_writer.reset_curr_line_len(); - } else { - if (!has_compress_) { - data_writer.set_write_bytes(has_split ? curr_line_len : curr_bytes); - } - } - if (has_compress_ && has_split) { - data_writer.get_compress_stream_writer()->reuse(); - } - data_writer.update_last_line_pos(); - } - return ret; -} - -int ObSelectIntoOp::get_buf(char* &buf, int64_t &buf_len, int64_t &pos, ObCsvFileWriter &data_writer) -{ - int ret = OB_SUCCESS; - buf = use_shared_buf_ ? get_shared_buf() : data_writer.get_buf(); - buf_len = use_shared_buf_ ? 
get_shared_buf_len() : data_writer.get_buf_len(); - pos = data_writer.get_curr_pos(); - if (OB_ISNULL(buf) && !use_shared_buf_ && OB_FAIL(use_shared_buf(data_writer, buf, buf_len, pos))) { - LOG_WARN("failed to use shared buffer", K(ret)); - } else if (OB_ISNULL(buf)) { - ret = OB_ERR_UNEXPECTED; - LOG_WARN("buf should not be null", K(ret)); - } - return ret; -} - -int ObSelectIntoOp::use_shared_buf(ObCsvFileWriter &data_writer, - char* &buf, - int64_t &buf_len, - int64_t &pos) -{ - int ret = OB_SUCCESS; - int64_t curr_pos = data_writer.get_curr_pos(); - if (!use_shared_buf_ && data_writer.get_last_line_pos() == 0) { - if (OB_NOT_NULL(data_writer.get_buf()) && curr_pos > 0) { - MEMCPY(shared_buf_, data_writer.get_buf(), curr_pos); - } - use_shared_buf_ = true; - buf = shared_buf_; - buf_len = shared_buf_len_; - pos = curr_pos; - } else { - ret = OB_ERR_UNEXPECTED; - LOG_WARN("last line should be flushed before this line copied", K(ret)); - } - return ret; -} - -int ObSelectIntoOp::resize_buf(char* &buf, - int64_t &buf_len, - int64_t &pos, - int64_t curr_pos, - bool is_json) -{ - int ret = OB_SUCCESS; - int64_t new_buf_len = buf_len * 2; - char* new_buf = NULL; - if (OB_ISNULL(new_buf = static_cast(ctx_.get_allocator().alloc(new_buf_len)))) { - ret = OB_ALLOCATE_MEMORY_FAILED; - LOG_WARN("failed to allocate buffer", K(ret), K(new_buf_len)); - } else if (!is_json) { - if (curr_pos > 0) { - MEMCPY(new_buf, shared_buf_, curr_pos); - } - shared_buf_ = new_buf; - shared_buf_len_ = new_buf_len; - } else { - json_buf_ = new_buf; - json_buf_len_ = new_buf_len; - } - if (OB_SUCC(ret)) { - buf = new_buf; - buf_len = new_buf_len; - pos = is_json ? 
0 : curr_pos; - } - return ret; -} - -int ObSelectIntoOp::resize_or_flush_shared_buf(ObCsvFileWriter &data_writer, - char* &buf, - int64_t &buf_len, - int64_t &pos) -{ - int ret = OB_SUCCESS; - if (!use_shared_buf_) { - ret = OB_ERR_UNEXPECTED; - LOG_WARN("get invalid argument", K(use_shared_buf_), K(ret)); - } else if (has_lob_ && data_writer.get_curr_pos() > 0) { - if (OB_FAIL(data_writer.flush_shared_buf(shared_buf_, true))) { - LOG_WARN("failed to flush shared buffer", K(ret)); - } else { - pos = 0; - } - } else if (OB_FAIL(resize_buf(buf, buf_len, pos, data_writer.get_curr_pos()))) { - LOG_WARN("failed to resize shared buffer", K(ret)); - } - return ret; -} - -int ObSelectIntoOp::check_buf_sufficient(ObCsvFileWriter &data_writer, - char* &buf, - int64_t &buf_len, - int64_t &pos, - int64_t str_len) -{ - int ret = OB_SUCCESS; - if (buf_len < str_len * 1.1) { - if (OB_FAIL(data_writer.flush_buf())) { - LOG_WARN("failed to flush buffer", K(ret)); - } else if (OB_FAIL(use_shared_buf(data_writer, buf, buf_len, pos))) { - LOG_WARN("failed to use shared buffer", K(ret)); - } - } - return ret; -} - -int ObSelectIntoOp::write_obj_to_file(const ObObj &obj, ObCsvFileWriter &data_writer, bool need_escape) -{ - int ret = OB_SUCCESS; - // binary collation do not require to escape when encode with base64/hex - if (obj.get_collation_type() == CS_TYPE_BINARY && - (print_params_.binary_string_print_hex_ || print_params_.binary_string_print_base64_)) { - need_escape = false; - } - - if ((obj.is_string_type() || obj.is_json() || obj.is_collection_sql_type()) && need_escape) { - if (OB_FAIL(print_str_or_json_with_escape(obj, data_writer))) { - LOG_WARN("failed to print str or json with escape", K(ret)); - } - } else if (OB_FAIL(print_normal_obj_without_escape(obj, data_writer))) { - LOG_WARN("failed to print normal obj without escape", K(ret)); - } - return ret; -} - -int ObSelectIntoOp::print_str_or_json_with_escape(const ObObj &obj, ObCsvFileWriter &data_writer) -{ - int ret = 
OB_SUCCESS; - char* buf = NULL; - int64_t buf_len = 0; - int64_t pos = 0; - ObCharsetType src_type = ObCharset::charset_type_by_coll(obj.get_collation_type()); - ObCharsetType dst_type = ObCharset::charset_type_by_coll(cs_type_); - escape_printer_.do_encode_ = !(src_type == CHARSET_BINARY || src_type == dst_type - || src_type == CHARSET_INVALID); - escape_printer_.need_enclose_ = has_enclose_ && !obj.is_null(); - escape_printer_.do_escape_ = true; - escape_printer_.print_hex_ = obj.get_collation_type() == CS_TYPE_BINARY - && print_params_.binary_string_print_hex_; - ObString str_to_escape; - ObEvalCtx::TempAllocGuard tmp_alloc_g(eval_ctx_); - common::ObArenaAllocator &temp_allocator = tmp_alloc_g.get_allocator(); - if (OB_FAIL(get_buf(escape_printer_.buf_, escape_printer_.buf_len_, escape_printer_.pos_, data_writer))) { - LOG_WARN("failed to get buffer", K(ret)); - } else if (obj.is_json() || obj.is_collection_sql_type()) { - ObObj inrow_obj = obj; - if (obj.is_lob_storage() - && OB_FAIL(ObTextStringIter::convert_outrow_lob_to_inrow_templob(obj, inrow_obj, NULL, &temp_allocator))) { - LOG_WARN("failed to convert outrow lobs", K(ret), K(obj)); - } else if (obj.is_collection_sql_type()) { - ObSubSchemaValue sub_meta; - if (OB_FAIL((get_exec_ctx().get_sqludt_meta_by_subschema_id(obj.get_meta().get_subschema_id(), sub_meta)))) { - LOG_WARN("failed to get collection subschema", K(ret), K(obj.get_meta().get_subschema_id())); - } else { - print_params_.coll_meta_ = reinterpret_cast(sub_meta.value_); - } - } - if (OB_FAIL(ret)) { - } else if (OB_FAIL(print_json_to_json_buf(inrow_obj, buf, buf_len, pos, data_writer))) { - LOG_WARN("failed to print normal obj without escape", K(ret)); - } else { - str_to_escape.assign_ptr(buf, pos); - escape_printer_.do_encode_ = false; - } - } else { - str_to_escape = obj.get_varchar(); - } - if (OB_SUCC(ret) && !use_shared_buf_ && OB_FAIL(check_buf_sufficient(data_writer, - escape_printer_.buf_, - escape_printer_.buf_len_, - 
escape_printer_.pos_, - str_to_escape.length()))) { - LOG_WARN("failed to check if buf is sufficient", K(ret)); - } - if (OB_SUCC(ret) && !use_shared_buf_) { - if (OB_FAIL(ObFastStringScanner::foreach_char(str_to_escape, - src_type, - escape_printer_, - escape_printer_.do_encode_, - escape_printer_.ignore_convert_failed_))) { - if (OB_SIZE_OVERFLOW != ret) { - LOG_WARN("failed to print plain str", K(ret), K(src_type), K(escape_printer_.do_encode_)); - } else if (OB_FAIL(data_writer.flush_buf())) { - LOG_WARN("failed to flush buffer", K(ret)); - } else if (OB_FALSE_IT(escape_printer_.pos_ = data_writer.get_curr_pos())) { - } else if (OB_FAIL(ObFastStringScanner::foreach_char(str_to_escape, - src_type, - escape_printer_, - escape_printer_.do_encode_, - escape_printer_.ignore_convert_failed_))) { - if (OB_SIZE_OVERFLOW != ret) { - LOG_WARN("failed to print plain str", K(ret), K(src_type), K(escape_printer_.do_encode_)); - } else if (OB_FAIL(use_shared_buf(data_writer, - escape_printer_.buf_, - escape_printer_.buf_len_, - escape_printer_.pos_))) { - LOG_WARN("failed to use shared buffer", K(ret)); - } - } - } - } - if (OB_SUCC(ret) && use_shared_buf_) { - do { - if (OB_FAIL(ObFastStringScanner::foreach_char(str_to_escape, - src_type, - escape_printer_, - escape_printer_.do_encode_, - escape_printer_.ignore_convert_failed_))) { - LOG_WARN("failed to print plain str", K(ret), K(src_type), K(escape_printer_.do_encode_)); - } - } while (OB_SIZE_OVERFLOW == ret && OB_SUCC(resize_or_flush_shared_buf(data_writer, - escape_printer_.buf_, - escape_printer_.buf_len_, - escape_printer_.pos_))); - if (OB_FAIL(ret)) { - LOG_WARN("failed to print plain str", K(ret)); - } - } - if (OB_SUCC(ret)) { - data_writer.set_curr_pos(escape_printer_.pos_); - } - - return ret; -} - -int ObSelectIntoOp::print_normal_obj_without_escape(const ObObj &obj, ObCsvFileWriter &data_writer) -{ - int ret = OB_SUCCESS; - char* buf = NULL; - int64_t buf_len = 0; - int64_t pos = 0; - OZ(get_buf(buf, buf_len, 
pos, data_writer)); - if (OB_SUCC(ret) && !use_shared_buf_) { - if (OB_FAIL(obj.print_plain_str_literal(buf, buf_len, pos, print_params_))) { - if (OB_SIZE_OVERFLOW != ret) { - LOG_WARN("failed to print obj", K(ret)); - } else if (OB_FAIL(data_writer.flush_buf())) { - LOG_WARN("failed to flush buffer", K(ret)); - } else if (OB_FALSE_IT(pos = data_writer.get_curr_pos())) { - } else if (OB_FAIL(obj.print_plain_str_literal(buf, buf_len, pos, print_params_))) { - if (OB_SIZE_OVERFLOW != ret) { - LOG_WARN("failed to print obj", K(ret)); - } else if (OB_FAIL(use_shared_buf(data_writer, buf, buf_len, pos))) { - LOG_WARN("failed to use shared buffer", K(ret)); - } - } - } - } - if (OB_SUCC(ret) && use_shared_buf_) { - do { - if (OB_FAIL(obj.print_plain_str_literal(buf, buf_len, pos, print_params_))) { - LOG_WARN("failed to print obj", K(ret)); - } - } while (OB_SIZE_OVERFLOW == ret - && OB_SUCC(resize_or_flush_shared_buf(data_writer, buf, buf_len, pos))); - if (OB_FAIL(ret)) { - LOG_WARN("failed to print obj", K(ret)); - } - } - if (OB_SUCC(ret)) { - data_writer.set_curr_pos(pos); - } - return ret; -} - -int ObSelectIntoOp::print_json_to_json_buf(const ObObj &obj, - char* &buf, - int64_t &buf_len, - int64_t &pos, - ObCsvFileWriter &data_writer) -{ - int ret = OB_SUCCESS; - buf = get_json_buf(); - buf_len = get_json_buf_len(); - pos = 0; - do { - if (OB_FAIL(obj.print_plain_str_literal(buf, buf_len, pos, print_params_))) { - LOG_WARN("failed to print obj", K(ret)); - } - } while (OB_SIZE_OVERFLOW == ret - && OB_SUCC(resize_buf(buf, buf_len, pos, data_writer.get_curr_pos(), true))); - if (OB_FAIL(ret)) { - LOG_WARN("failed to print json to json buffer", K(ret)); - } - return ret; -} - -int ObSelectIntoOp::write_lob_to_file(const ObObj &obj, - const ObExpr &expr, - const ObDatum &datum, - ObCsvFileWriter &data_writer) -{ - int ret = OB_SUCCESS; - ObCharsetType src_type = ObCharset::charset_type_by_coll(obj.get_collation_type()); - ObCharsetType dst_type = 
ObCharset::charset_type_by_coll(cs_type_); - escape_printer_.need_enclose_ = has_enclose_; - escape_printer_.do_encode_ = !(src_type == CHARSET_BINARY || src_type == dst_type - || src_type == CHARSET_INVALID); - escape_printer_.do_escape_ = has_escape_; - escape_printer_.print_hex_ = obj.get_collation_type() == CS_TYPE_BINARY - && print_params_.binary_string_print_hex_; - ObDatumMeta input_meta = expr.datum_meta_; - ObTextStringIterState state; - ObString src_block_data; - ObTextStringIter lob_iter(input_meta.type_, input_meta.cs_type_, datum.get_string(), - expr.obj_meta_.has_lob_header()); - ObEvalCtx::TempAllocGuard tmp_alloc_g(eval_ctx_); - common::ObArenaAllocator &temp_allocator = tmp_alloc_g.get_allocator(); - int64_t truncated_len = 0; - bool stop_when_truncated = false; - OZ(lob_iter.init(0, NULL, &temp_allocator)); - OZ(get_buf(escape_printer_.buf_, escape_printer_.buf_len_, escape_printer_.pos_, data_writer)); - // When truncated_len == src_block_data.length() when truncated length equals source block data length - // Indicates that the current foreach_char is processing only invalid data at the end of the lob, i.e., truncated data from the previous round, to avoid infinite loops - while (OB_SUCC(ret) - && (state = lob_iter.get_next_block(src_block_data)) == TEXTSTRING_ITER_NEXT) { - // outrow lob will only be false on the last iteration, inrow lob iterates only once, and is false - stop_when_truncated = (truncated_len != src_block_data.length()) && lob_iter.is_outrow_lob(); - if (!use_shared_buf_ && OB_FAIL(check_buf_sufficient(data_writer, - escape_printer_.buf_, - escape_printer_.buf_len_, - escape_printer_.pos_, - src_block_data.length()))) { - LOG_WARN("failed to check if buf is sufficient", K(ret)); - } - if (OB_SUCC(ret) && !use_shared_buf_) { - if (OB_FAIL(ObFastStringScanner::foreach_char(src_block_data, - src_type, - escape_printer_, - escape_printer_.do_encode_, - escape_printer_.ignore_convert_failed_, - stop_when_truncated, - 
&truncated_len))) { - if (OB_ERR_DATA_TRUNCATED == ret && stop_when_truncated) { - lob_iter.set_reserved_byte_len(truncated_len); - ret = OB_SUCCESS; - } else if (OB_SIZE_OVERFLOW != ret) { - LOG_WARN("failed to print lob", K(ret)); - } else if (OB_FAIL(data_writer.flush_buf())) { - LOG_WARN("failed to flush buffer", K(ret)); - } else if (OB_FALSE_IT(escape_printer_.pos_ = data_writer.get_curr_pos())) { - } else if (OB_FAIL(ObFastStringScanner::foreach_char(src_block_data, - src_type, - escape_printer_, - escape_printer_.do_encode_, - escape_printer_.ignore_convert_failed_, - stop_when_truncated, - &truncated_len))) { - if (OB_ERR_DATA_TRUNCATED == ret && stop_when_truncated) { - lob_iter.set_reserved_byte_len(truncated_len); - ret = OB_SUCCESS; - } else if (OB_SIZE_OVERFLOW != ret) { - LOG_WARN("failed to print lob", K(ret)); - } else if (OB_FAIL(use_shared_buf(data_writer, - escape_printer_.buf_, - escape_printer_.buf_len_, - escape_printer_.pos_))) { - LOG_WARN("failed to use shared buffer", K(ret)); - } - } - } - } - if (OB_SUCC(ret) && use_shared_buf_) { - if (OB_FAIL(ObFastStringScanner::foreach_char(src_block_data, - src_type, - escape_printer_, - escape_printer_.do_encode_, - escape_printer_.ignore_convert_failed_, - stop_when_truncated, - &truncated_len))) { - if (OB_ERR_DATA_TRUNCATED == ret && stop_when_truncated) { - lob_iter.set_reserved_byte_len(truncated_len); - ret = OB_SUCCESS; - } else if (OB_SIZE_OVERFLOW != ret) { - LOG_WARN("failed to print lob", K(ret)); - } else if (OB_FAIL(data_writer.flush_shared_buf(shared_buf_, true))) { - LOG_WARN("failed to flush shared buffer", K(ret)); - } else if (OB_FALSE_IT(escape_printer_.pos_ = 0)) { - } else if (OB_FAIL(ObFastStringScanner::foreach_char(src_block_data, - src_type, - escape_printer_, - escape_printer_.do_encode_, - escape_printer_.ignore_convert_failed_, - stop_when_truncated, - &truncated_len))) { - if (OB_ERR_DATA_TRUNCATED == ret && stop_when_truncated) { - 
lob_iter.set_reserved_byte_len(truncated_len); - ret = OB_SUCCESS; - } else { - LOG_WARN("failed to print lob", K(ret), K(src_block_data.length()), K(shared_buf_len_), - K(data_writer.get_curr_pos()), K(escape_printer_.buf_len_), K(escape_printer_.pos_)); - } - } - } - } - data_writer.set_curr_pos(escape_printer_.pos_); - } - if (OB_FAIL(ret)) { - } else if (state != TEXTSTRING_ITER_NEXT && state != TEXTSTRING_ITER_END) { - ret = (lob_iter.get_inner_ret() != OB_SUCCESS) ? - lob_iter.get_inner_ret() : OB_INVALID_DATA; - LOG_WARN("iter state invalid", K(ret), K(state), K(lob_iter)); - } - return ret; -} - -int ObSelectIntoOp::write_single_char_to_file(const char *wchar, ObCsvFileWriter &data_writer) -{ - int ret = OB_SUCCESS; - char* buf = NULL; - int64_t buf_len = 0; - int64_t pos = 0; - OZ(get_buf(buf, buf_len, pos, data_writer)); - if (OB_SUCC(ret) && !use_shared_buf_) { - if (pos < buf_len) { - MEMCPY(buf + pos, wchar, 1); - data_writer.set_curr_pos(pos + 1); - } else if (OB_FAIL(data_writer.flush_buf())) { - LOG_WARN("failed to flush buffer", K(ret)); - } else if (OB_FALSE_IT(pos = data_writer.get_curr_pos())) { - } else if (pos < buf_len) { - MEMCPY(buf + pos, wchar, 1); - data_writer.set_curr_pos(pos + 1); - } else if (OB_FAIL(use_shared_buf(data_writer, buf, buf_len, pos))) { - LOG_WARN("failed to use shared buffer", K(ret)); - } - } - if (OB_SUCC(ret) && use_shared_buf_) { - if (pos < buf_len) { - MEMCPY(buf + pos, wchar, 1); - data_writer.set_curr_pos(pos + 1); - } else if (OB_FAIL(resize_or_flush_shared_buf(data_writer, buf, buf_len, pos))) { - LOG_WARN("failed to resize or flush shared buffer", K(ret)); - } else if (pos < buf_len) { - MEMCPY(buf + pos, wchar, 1); - data_writer.set_curr_pos(pos + 1); - } else { - ret = OB_ERR_UNEXPECTED; - LOG_WARN("get unexpected error", K(ret)); - } - } - return ret; -} - -int ObSelectIntoOp::print_lob_field(const ObObj &obj, - const ObExpr &expr, - const ObDatum &datum, - ObCsvFileWriter &data_writer) -{ - int ret = 
OB_SUCCESS; - if (has_enclose_) { - OZ(write_single_char_to_file(&char_enclose_, data_writer)); - } - OZ(write_lob_to_file(obj, expr, datum, data_writer)); - if (has_enclose_) { - OZ(write_single_char_to_file(&char_enclose_, data_writer)); - } - return ret; -} - -int ObSelectIntoOp::print_field(const ObObj &obj, ObCsvFileWriter &data_writer) -{ - int ret = OB_SUCCESS; - char char_n = 'N'; - const bool need_enclose = has_enclose_ && !obj.is_null() - && (!is_optional_ || obj.is_string_type() || obj.is_collection_sql_type() - || obj.is_json() || obj.is_geometry() || obj.is_date() - || obj.is_time() || obj.is_timestamp() || obj.is_datetime() - || obj.is_mysql_date() || obj.is_mysql_datetime()); - if (need_enclose) { - OZ(write_single_char_to_file(&char_enclose_, data_writer)); - } - if (!has_escape_) { - OZ(write_obj_to_file(obj, data_writer, false)); - } else if (obj.is_null()) { - OZ(write_single_char_to_file(&char_escape_, data_writer)); - OZ(write_single_char_to_file(&char_n, data_writer)); - } else { - OZ(write_obj_to_file(obj, data_writer, true)); - } - if (need_enclose) { - OZ(write_single_char_to_file(&char_enclose_, data_writer)); - } - return ret; -} - -int ObSelectIntoOp::into_outfile(ObExternalFileWriter *data_writer) -{ - int ret = OB_SUCCESS; - const ObIArray &select_exprs = MY_SPEC.select_exprs_; - ObDatum *datum = NULL; - ObObj obj; - ObDatum *partition_datum = NULL; - ObCsvFileWriter *csv_data_writer = NULL; - if (do_partition_) { - if (OB_FAIL(MY_SPEC.file_partition_expr_->eval(eval_ctx_, partition_datum))) { - LOG_WARN("eval expr failed", K(ret)); - } else if (OB_ISNULL(partition_datum)) { - ret = OB_ERR_UNEXPECTED; - LOG_WARN("get unexpected null", K(ret)); - } else if (OB_FAIL(get_data_writer_for_partition(partition_datum->get_string(), data_writer))) { - LOG_WARN("failed to set data writer for partition", K(ret)); - } - } - if (OB_SUCC(ret)) { - if (OB_ISNULL(csv_data_writer = static_cast(data_writer))) { - ret = OB_ERR_UNEXPECTED; - LOG_WARN("get 
unexpected null data writer", K(ret)); - } - } - for (int64_t i = 0; OB_SUCC(ret) && i < select_exprs.count(); ++i) { - if (OB_ISNULL(select_exprs.at(i))) { - ret = OB_ERR_UNEXPECTED; - LOG_WARN("select expr is unexpected null", K(ret)); - } else if (OB_FAIL(select_exprs.at(i)->eval(eval_ctx_, datum))) { - LOG_WARN("eval expr failed", K(ret)); - } else if (OB_ISNULL(datum)) { - ret = OB_ERR_UNEXPECTED; - LOG_WARN("datum is unexpected null", K(ret)); - } else if (OB_FAIL(datum->to_obj(obj, - select_exprs.at(i)->obj_meta_, - select_exprs.at(i)->obj_datum_map_))) { - LOG_WARN("failed to get obj from datum", K(ret)); - } else if (!ob_is_text_tc(select_exprs.at(i)->obj_meta_.get_type()) || obj.is_null()) { - OZ(print_field(obj, *csv_data_writer)); - } else { // text tc - OZ(print_lob_field(obj, *select_exprs.at(i), *datum, *csv_data_writer)); - } - // print field terminator - if (OB_SUCC(ret) && i != select_exprs.count() - 1) { - OZ(write_obj_to_file(field_str_, *csv_data_writer)); - } - } - // print line terminator - OZ(write_obj_to_file(line_str_, *csv_data_writer)); - // check if need split file - OZ(check_csv_file_size(*csv_data_writer)); - // clear shared buffer - OZ(csv_data_writer->flush_shared_buf(shared_buf_)); - if (has_compress_) { - OZ(csv_data_writer->flush_buf()); - } - return ret; -} - -static OB_INLINE int get_cast_ret(const bool is_strict_mode, int ret) -{ - if (OB_SUCCESS != ret && !is_strict_mode) { - ret = OB_SUCCESS; - } - return ret; -} - -#define CAST_FAIL(stmt) \ - (OB_UNLIKELY((OB_SUCCESS != (ret = get_cast_ret((is_strict_mode), (stmt)))))) - - -int ObSelectIntoOp::decimal_to_string(const ObDatum &datum, - const ObDatumMeta &datum_meta, - std::string &res, - ObIAllocator &allocator) -{ - int ret = OB_SUCCESS; - char *buf = NULL; - int64_t pos = 0; - if (OB_ISNULL(buf = static_cast(allocator.alloc(OB_CAST_TO_VARCHAR_MAX_LENGTH)))) { - ret = OB_ALLOCATE_MEMORY_FAILED; - LOG_WARN("failed to alloc memory", K(ret)); - } else if 
(OB_FAIL(wide::to_string(datum.get_decimal_int(), datum.get_int_bytes(), datum_meta.scale_, - buf, OB_CAST_TO_VARCHAR_MAX_LENGTH, pos))) { - LOG_WARN("failed to get string", K(ret)); - } else { - res.assign(buf, pos); - } - return ret; -} - - -int ObSelectIntoOp::into_outfile_batch_csv(const ObBatchRows &brs, ObExternalFileWriter *data_writer) -{ - int ret = OB_SUCCESS; - const ObIArray &select_exprs = MY_SPEC.select_exprs_; - ObArray datum_vectors; - ObDatum *datum = NULL; - ObObj obj; - ObDatumVector partition_datum_vector; - ObCsvFileWriter *csv_data_writer = NULL; - for (int64_t i = 0; OB_SUCC(ret) && i < select_exprs.count(); ++i) { - if (OB_FAIL(select_exprs.at(i)->eval_batch(eval_ctx_, *brs.skip_, brs.size_))) { - LOG_WARN("failed to eval batch", K(ret)); - } else if (OB_FAIL(datum_vectors.push_back(select_exprs.at(i)->locate_expr_datumvector(eval_ctx_)))) { - LOG_WARN("failed to push back datum vector", K(ret)); - } - } - - if (OB_SUCC(ret) && do_partition_) { - if (OB_FAIL(MY_SPEC.file_partition_expr_->eval_batch(eval_ctx_, *brs.skip_, brs.size_))) { - LOG_WARN("failed to eval batch", K(ret)); - } else { - partition_datum_vector = MY_SPEC.file_partition_expr_->locate_expr_datumvector(eval_ctx_); - } - } - for (int64_t i = 0; OB_SUCC(ret) && i < brs.size_; ++i) { - if (brs.skip_->contain(i)) { - // do nothing - } else if (do_partition_ && OB_ISNULL(partition_datum_vector.at(i))) { - ret = OB_ERR_UNEXPECTED; - LOG_WARN("get unexpected null", K(ret)); - } else if (do_partition_ && OB_FAIL(get_data_writer_for_partition(partition_datum_vector.at(i)->get_string(), - data_writer))) { - LOG_WARN("failed to set data writer for partition", K(ret)); - } else if (OB_ISNULL(csv_data_writer = static_cast(data_writer))) { - ret = OB_ERR_UNEXPECTED; - LOG_WARN("get unexpected null data writer", K(ret)); - } else if (has_compress_ && OB_ISNULL(csv_data_writer->get_compress_stream_writer()) - && OB_FAIL(csv_data_writer->init_compress_writer(ctx_.get_allocator(), - 
external_properties_.csv_format_.compression_algorithm_, - MY_SPEC.buffer_size_))) { - LOG_WARN("failed to init compress stream writer", K(ret)); - } else { - for (int64_t col_idx = 0; OB_SUCC(ret) && col_idx < select_exprs.count(); ++col_idx) { - if (OB_ISNULL(datum = datum_vectors.at(col_idx).at(i))) { - ret = OB_ERR_UNEXPECTED; - LOG_WARN("datum is unexpected null", K(ret)); - } else if (OB_FAIL(datum->to_obj(obj, - select_exprs.at(col_idx)->obj_meta_, - select_exprs.at(col_idx)->obj_datum_map_))) { - LOG_WARN("failed to get obj from datum", K(ret)); - } else if (!ob_is_text_tc(select_exprs.at(col_idx)->obj_meta_.get_type()) || obj.is_null()) { - OZ(print_field(obj, *csv_data_writer)); - } else { // text tc - OZ(print_lob_field(obj, *select_exprs.at(col_idx), *datum, *csv_data_writer)); - } - // print field terminator - if (OB_SUCC(ret) && col_idx != select_exprs.count() - 1) { - OZ(write_obj_to_file(field_str_, *csv_data_writer)); - } - } - // print line terminator - OZ(write_obj_to_file(line_str_, *csv_data_writer)); - // check if need split file - OZ(check_csv_file_size(*csv_data_writer)); - // clear shared buffer - OZ(csv_data_writer->flush_shared_buf(shared_buf_)); - if (has_compress_) { - OZ(csv_data_writer->flush_buf()); - } - } - } - return ret; -} - -int ObSelectIntoOp::get_data_from_expr_vector(const common::ObIVector* expr_vector, - int row_idx, - ObObjType type, - int64_t &value, - const bool is_strict_mode, - const ObDateSqlMode date_sql_mode) -{ - int ret = OB_SUCCESS; - int32_t date; - switch(type) { - case ObTinyIntType: - value = expr_vector->get_tinyint(row_idx); - break; - case ObSmallIntType: - value = expr_vector->get_smallint(row_idx); - break; - case ObMediumIntType: - value = expr_vector->get_mediumint(row_idx); - break; - case ObInt32Type: - value = expr_vector->get_int32(row_idx); - break; - case ObIntType: - value = expr_vector->get_int(row_idx); - break; - case ObYearType: - value = expr_vector->get_year(row_idx); - break; - case 
ObDateType: - value = expr_vector->get_date(row_idx); - break; - case ObMySQLDateType: - CAST_FAIL( - ObTimeConverter::mdate_to_date(expr_vector->get_mysql_date(row_idx), date, date_sql_mode)); - value = date; - break; - case ObMySQLDateTimeType: - CAST_FAIL(ObTimeConverter::mdatetime_to_datetime(expr_vector->get_mysql_datetime(row_idx), value, - date_sql_mode)); - break; - default: - ret = OB_OBJ_TYPE_ERROR; - } - return ret; -} - -bool ObSelectIntoOp::file_need_split(int64_t file_size) -{ - return (file_location_ == IntoFileLocation::SERVER_DISK - && !MY_SPEC.is_single_ && file_size > MY_SPEC.max_file_size_) - || (file_location_ != IntoFileLocation::SERVER_DISK - && ((!MY_SPEC.is_single_ && file_size > min(MY_SPEC.max_file_size_, MAX_OSS_FILE_SIZE)) - || (MY_SPEC.is_single_ && file_size > MAX_OSS_FILE_SIZE))); -} - -int ObSelectIntoOp::check_oracle_number(ObObjType obj_type, int16_t &precision, int8_t scale) -{ - int ret = OB_SUCCESS; - return ret; -} - -int ObSelectIntoOp::calc_byte_array(const common::ObIVector* expr_vector, - int row_idx, - const ObDatumMeta &datum_meta, - const ObObjMeta &obj_meta, - ObIAllocator &allocator, + #define USING_LOG_PREFIX SQL_ENG + + #include + #include + + #include "ob_select_into_op.h" + #include "sql/engine/cmd/ob_variable_set_executor.h" + #include "lib/charset/ob_charset_string_helper.h" + #include "sql/engine/px/ob_px_sqc_handler.h" + #include "sql/engine/expr/ob_expr_json_func_helper.h" + #include "lib/udt/ob_collection_type.h" + #include "share/config/ob_server_config.h" + + #ifndef OB_BUILD_EMBED_MODE + #include + #include + #include + #include + #include + #include + #include + + #define ARROW_FAIL(statement) (OB_UNLIKELY(!(statement).ok())) + + #endif + + namespace oceanbase + { + using namespace common; + namespace sql + { + + OB_SERIALIZE_MEMBER(ObSelectIntoOpInput, task_id_, sqc_id_); + OB_SERIALIZE_MEMBER((ObSelectIntoSpec, ObOpSpec), into_type_, user_vars_, outfile_name_, + field_str_, // FARM COMPAT WHITELIST FOR 
filed_str_: renamed + line_str_, closed_cht_, is_optional_, select_exprs_, is_single_, max_file_size_, + escaped_cht_, cs_type_, parallel_, file_partition_expr_, buffer_size_, is_overwrite_, + external_properties_, external_partition_, alias_names_); + + + int ObSelectIntoOp::inner_open() + { + int ret = OB_SUCCESS; + ObSQLSessionInfo *session = NULL; + if (OB_ISNULL(session = ctx_.get_my_session())) { + ret = OB_ERR_UNEXPECTED; + LOG_WARN("get session failed", K(ret)); + } else { + // since we call get_next_row in inner_open, we have to set opened_ first in avoid to a infinite loop. + opened_ = true; + if (OB_FAIL(session->get_sql_select_limit(top_limit_cnt_))) { + LOG_WARN("fail tp get sql select limit", K(ret)); + } + } + if (OB_SUCC(ret) && !MY_SPEC.external_properties_.str_.empty()) { + if (OB_FAIL(external_properties_.load_from_string(MY_SPEC.external_properties_.str_, + ctx_.get_allocator()))) { + LOG_WARN("failed to load external properties", K(ret)); + } else { + format_type_ = external_properties_.format_type_; + } + } + if (OB_SUCC(ret)) { + switch (format_type_) + { + case ObExternalFileFormat::FormatType::CSV_FORMAT: + { + if (OB_FAIL(init_csv_env())) { + LOG_WARN("failed to init csv env", K(ret)); + } + break; + } + case ObExternalFileFormat::FormatType::ODPS_FORMAT: + { + if (!GCONF._use_odps_jni_connector) { + ret = OB_NOT_SUPPORTED; + LOG_WARN("not support odps format", K(ret)); + } else { + ret = OB_NOT_SUPPORTED; + LOG_WARN("not support odps format", K(ret)); + } + break; + } + case ObExternalFileFormat::FormatType::PARQUET_FORMAT: + { + #ifndef OB_BUILD_EMBED_MODE + if (OB_FAIL(init_parquet_env())) { + LOG_WARN("failed to init parquet env", K(ret)); + } + #endif + break; + } + case ObExternalFileFormat::FormatType::ORC_FORMAT: + { + ret = OB_NOT_SUPPORTED; + break; + } + default: + { + ret = OB_NOT_SUPPORTED; + LOG_WARN("not support select into type", K(format_type_)); + } + } + } + return ret; + } + + int ObSelectIntoOp::init_csv_env() + { + 
int ret = OB_SUCCESS; + ObSQLSessionInfo *session = NULL; + set_csv_format_options(); + if (OB_ISNULL(session = ctx_.get_my_session())) { + ret = OB_ERR_UNEXPECTED; + LOG_WARN("get session failed", K(ret)); + } else if (OB_FAIL(init_env_common())) { + LOG_WARN("failed to init env common", K(ret)); + } else if (OB_FAIL(prepare_escape_printer())) { + LOG_WARN("failed to calc escape info", K(ret)); + } else { + if (external_properties_.csv_format_.compression_algorithm_ != CsvCompressType::NONE) { + has_compress_ = true; + } + // setup binary output format for bit/binary + switch (external_properties_.csv_format_.binary_format_) { + case ObCSVGeneralFormat::ObCSVBinaryFormat::DEFAULT: + print_params_.binary_string_print_hex_ = false; + break; + case ObCSVGeneralFormat::ObCSVBinaryFormat::HEX: + print_params_.binary_string_print_hex_ = true; + break; + case ObCSVGeneralFormat::ObCSVBinaryFormat::BASE64: + print_params_.binary_string_print_base64_ = true; + break; + default: + ret = OB_ERR_UNEXPECTED; + LOG_WARN("failed to set csv binary output format", K(ret)); + } + print_params_.tz_info_ = session->get_timezone_info(); + print_params_.use_memcpy_ = true; + print_params_.cs_type_ = cs_type_; + } + //create buffer + if (OB_SUCC(ret) && T_INTO_OUTFILE == MY_SPEC.into_type_ && OB_FAIL(create_shared_buffer_for_data_writer())) { + LOG_WARN("failed to create buffer for data writer", K(ret)); + } + return ret; + } + + void ObSelectIntoOp::set_csv_format_options() + { + if (MY_SPEC.external_properties_.str_.empty()) { + field_str_ = MY_SPEC.field_str_; + line_str_ = MY_SPEC.line_str_; + has_enclose_ = MY_SPEC.closed_cht_.get_val_len() > 0; + char_enclose_ = has_enclose_ ? MY_SPEC.closed_cht_.get_char().ptr()[0] : 0; + is_optional_ = MY_SPEC.is_optional_; + has_escape_ = MY_SPEC.escaped_cht_.get_val_len() > 0; + char_escape_ = has_escape_ ? 
MY_SPEC.escaped_cht_.get_char().ptr()[0] : 0; + cs_type_ = MY_SPEC.cs_type_; + } else { + is_optional_ = external_properties_.csv_format_.is_optional_; + cs_type_ = ObCharset::get_default_collation(external_properties_.csv_format_.cs_type_); + field_str_.set_varchar(external_properties_.csv_format_.field_term_str_); + field_str_.set_collation_type(cs_type_); + line_str_.set_varchar(external_properties_.csv_format_.line_term_str_); + line_str_.set_collation_type(cs_type_); + if (external_properties_.csv_format_.field_enclosed_char_ == INT64_MAX) { // null + has_enclose_ = false; + char_enclose_ = 0; + } else { + has_enclose_ = true; + char_enclose_ = external_properties_.csv_format_.field_enclosed_char_; + } + if (external_properties_.csv_format_.field_escaped_char_ == INT64_MAX) { // null + has_escape_ = false; + char_escape_ = 0; + } else { + has_escape_ = true; + char_escape_ = external_properties_.csv_format_.field_escaped_char_; + } + } + } + + int ObSelectIntoOp::init_env_common() + { + int ret = OB_SUCCESS; + ObPhysicalPlanCtx *phy_plan_ctx = NULL; + bool need_check = false; + file_name_ = MY_SPEC.outfile_name_; + do_partition_ = MY_SPEC.file_partition_expr_ == NULL ? 
false : true; + if (OB_ISNULL(phy_plan_ctx = ctx_.get_physical_plan_ctx())) { + ret = OB_ERR_UNEXPECTED; + LOG_WARN("get phy_plan_ctx failed", K(ret)); + } else if (OB_FAIL(ObSQLUtils::get_param_value(MY_SPEC.outfile_name_, + phy_plan_ctx->get_param_store(), + file_name_, + need_check))) { + LOG_WARN("get param value failed", K(ret)); + } else if (OB_FAIL(calc_url_and_set_access_info())) { + LOG_WARN("failed to calc basic url and set device handle", K(ret)); + } else if (OB_FAIL(check_has_lob_or_json())) { + LOG_WARN("failed to check has lob", K(ret)); + } else if (has_coll_ && MY_SPEC.into_type_ == T_INTO_VARIABLES) { + ret = OB_NOT_SUPPORTED; + LOG_USER_ERROR(OB_NOT_SUPPORTED, "select array/map into variables"); + } else if (do_partition_ + && OB_FAIL(partition_map_.create(128, ObLabel("SelectInto"), ObLabel("SelectInto"), MTL_ID()))) { + LOG_WARN("failed to create hashmap", K(ret)); + } else if (MY_SPEC.select_exprs_.count() != MY_SPEC.alias_names_.strs_.count()) { + ret = OB_ERR_UNEXPECTED; + LOG_WARN("get unexpected column count", K(MY_SPEC.select_exprs_.count()), + K(MY_SPEC.alias_names_.strs_.count()), K(ret)); + } + return ret; + } + + //calc first data_writer.url_ and basic_url_ + int ObSelectIntoOp::calc_url_and_set_access_info() + { + int ret = OB_SUCCESS; + const ObItemType into_type = MY_SPEC.into_type_; + ObString path = file_name_.get_varchar().trim(); + if (path.prefix_match_ci(OB_S3_PREFIX)) { + ret = OB_NOT_SUPPORTED; + LOG_USER_ERROR(OB_NOT_SUPPORTED, "S3 storage"); + LOG_WARN("S3 storage is not supported", K(ret)); + } else if (path.prefix_match_ci(OB_AZBLOB_PREFIX)) { + file_location_ = IntoFileLocation::REMOTE_AZBLOB; + } else if (path.prefix_match_ci(OB_OSS_PREFIX)) { + ret = OB_NOT_SUPPORTED; + LOG_USER_ERROR(OB_NOT_SUPPORTED, "OSS storage"); + LOG_WARN("OSS storage is not supported", K(ret)); + } else if (path.prefix_match_ci(OB_COS_PREFIX)) { + ret = OB_NOT_SUPPORTED; + LOG_USER_ERROR(OB_NOT_SUPPORTED, "COS storage"); + LOG_WARN("COS 
storage is not supported", K(ret)); + } else if (path.prefix_match_ci(OB_HDFS_PREFIX)) { + ret = OB_NOT_SUPPORTED; + LOG_USER_ERROR(OB_NOT_SUPPORTED, "HDFS storage"); + LOG_WARN("HDFS storage is not supported", K(ret)); + } else { + file_location_ = IntoFileLocation::SERVER_DISK; + } + if (file_location_ == IntoFileLocation::SERVER_DISK && do_partition_) { + ret = OB_NOT_SUPPORTED; + LOG_WARN("not support partition option on server disk", K(ret)); + LOG_USER_ERROR(OB_NOT_SUPPORTED, "partition option on server disk"); + } else if (T_INTO_OUTFILE == into_type && !MY_SPEC.is_single_ && OB_FAIL(calc_first_file_path(path))) { + LOG_WARN("failed to calc first file path", K(ret)); + } else if (file_location_ != IntoFileLocation::SERVER_DISK) { + ObString temp_url = path.split_on('?'); + temp_url.trim(); + ObString storage_info; + if (OB_FAIL(ob_write_string(ctx_.get_allocator(), temp_url, basic_url_, true))) { + LOG_WARN("failed to append string", K(ret)); + } else if (OB_FAIL(ob_write_string(ctx_.get_allocator(), path, storage_info, true))) { + LOG_WARN("failed to append string", K(ret)); + } else if (OB_FAIL(access_info_.set(basic_url_.ptr(), storage_info.ptr()))) { + LOG_WARN("failed to set access info", K(ret), K(path)); + } else if (basic_url_.empty() || !access_info_.is_valid()) { + ret = OB_FILE_NOT_EXIST; + LOG_WARN("file path not exist", K(ret), K(basic_url_), K(access_info_)); + } + } else { // IntoFileLocation::SERVER_DISK + if (OB_FAIL(ob_write_string(ctx_.get_allocator(), path, basic_url_, true))) { + LOG_WARN("failed to write string", K(ret)); + } + } + if (OB_SUCC(ret) && (T_INTO_OUTFILE == into_type || T_INTO_DUMPFILE == into_type) + && IntoFileLocation::SERVER_DISK == file_location_ && OB_FAIL(check_secure_file_path(basic_url_))) { + LOG_WARN("failed to check secure file path", K(ret)); + } + return ret; + } + // csv, odps supports batch and non-batch interfaces; parquet, orc only supports batch interface; non-batch interface will be discontinued later + 
int ObSelectIntoOp::inner_get_next_row() + { + int ret = 0 == top_limit_cnt_ ? OB_ITER_END : OB_SUCCESS; + int64_t row_count = 0; + const ObItemType into_type = MY_SPEC.into_type_; + ObPhysicalPlanCtx *phy_plan_ctx = NULL; + ObExternalFileWriter *data_writer = NULL; + if (ObExternalFileFormat::FormatType::CSV_FORMAT != format_type_ + && ObExternalFileFormat::FormatType::ODPS_FORMAT != format_type_) { + ret = OB_NOT_SUPPORTED; + LOG_WARN("this type not supported in not batch interface", K(ret), K(format_type_)); + LOG_USER_ERROR(OB_NOT_SUPPORTED, "this upload type"); + } else if (OB_ISNULL(phy_plan_ctx = ctx_.get_physical_plan_ctx())) { + ret = OB_ERR_UNEXPECTED; + LOG_WARN("get phy_plan_ctx failed", K(ret)); + } + //when do_partition is false, create the only data_writer here + if (OB_SUCC(ret) && ObExternalFileFormat::FormatType::CSV_FORMAT == format_type_ + && T_INTO_VARIABLES != into_type && !do_partition_ + && OB_FAIL(create_the_only_data_writer(data_writer))) { + LOG_WARN("failed to create the only data writer", K(ret)); + } + while (OB_SUCC(ret) && row_count < top_limit_cnt_) { + clear_evaluated_flag(); + if (OB_FAIL(child_->get_next_row())) { + if (OB_LIKELY(OB_ITER_END == ret)) { + } else { + LOG_WARN("get next row failed", K(ret)); + } + } else { + ++row_count; + if (ObExternalFileFormat::FormatType::ODPS_FORMAT == format_type_) { + if (is_odps_cpp_table_ == is_odps_java_table_) { + ret = OB_ERR_UNEXPECTED; + LOG_WARN("invalid table mode for odps table", K(ret), + K(is_odps_cpp_table_), K(is_odps_java_table_)); + } else if (is_odps_cpp_table_) { + ret = OB_NOT_SUPPORTED; + LOG_USER_ERROR(OB_NOT_SUPPORTED, "external odps cpp table"); + LOG_WARN("use supported version", K(ret)); + } else { + ret = OB_NOT_SUPPORTED; + LOG_USER_ERROR(OB_NOT_SUPPORTED, "external odps table"); + LOG_WARN("not support jni odps single write", K(ret)); + } + } else if (T_INTO_VARIABLES == into_type) { + if (OB_FAIL(into_varlist())) { + LOG_WARN("into varlist failed", K(ret)); + } + 
} else if (T_INTO_OUTFILE == into_type) { + if (OB_FAIL(into_outfile(data_writer))) { + LOG_WARN("into outfile failed", K(ret)); + } + } else { + if (OB_FAIL(into_dumpfile(data_writer))) { + LOG_WARN("into dumpfile failed", K(ret)); + } + } + } + if (OB_SUCC(ret) || OB_ITER_END == ret) { // if into user variables or into dumpfile, must be one row + if (ObExternalFileFormat::FormatType::CSV_FORMAT == format_type_ + && (T_INTO_VARIABLES == into_type || T_INTO_DUMPFILE == into_type) && row_count > 1) { + ret = OB_ERR_TOO_MANY_ROWS; + LOG_WARN("more than one row for into variables or into dumpfile", K(ret), K(row_count)); + } + } + } //end while + if (OB_ITER_END == ret || OB_SUCC(ret)) { // set affected rows + phy_plan_ctx->set_affected_rows(row_count); + } + if (OB_FAIL(ret) && OB_ITER_END != ret) { + need_commit_ = false; + } + return ret; + } + + int ObSelectIntoOp::inner_get_next_batch(const int64_t max_row_cnt) + { + int ret = OB_SUCCESS; + const ObBatchRows *child_brs = NULL; + int64_t batch_size = min(max_row_cnt, MY_SPEC.max_batch_size_); + int64_t row_count = 0; + const ObItemType into_type = MY_SPEC.into_type_; + ObPhysicalPlanCtx *phy_plan_ctx = NULL; + ObExternalFileWriter *data_writer = NULL; + bool stop_loop = false; + bool is_iter_end = false; + if (OB_ISNULL(phy_plan_ctx = ctx_.get_physical_plan_ctx())) { + ret = OB_ERR_UNEXPECTED; + LOG_WARN("get phy_plan_ctx failed", K(ret)); + } + //when do_partition is false, create the only data_writer here + if (OB_SUCC(ret) && T_INTO_VARIABLES != into_type && !do_partition_ + && (ObExternalFileFormat::FormatType::CSV_FORMAT == format_type_ + || ObExternalFileFormat::FormatType::PARQUET_FORMAT == format_type_ + )) { + if (OB_FAIL(create_the_only_data_writer(data_writer))) { + LOG_WARN("failed to create the only data writer", K(ret)); + } else if (OB_ISNULL(data_writer)) { + ret = OB_ERR_UNEXPECTED; + LOG_WARN("get unexpected null", K(ret)); + } + } + + if (0 == top_limit_cnt_) { + brs_.size_ = 0; + brs_.end_ = 
true; + stop_loop = true; + } + while (OB_SUCC(ret) && !stop_loop) { + clear_evaluated_flag(); + int64_t rowkey_batch_size = min(batch_size, top_limit_cnt_ - row_count); + if (OB_FAIL(child_->get_next_batch(rowkey_batch_size, child_brs))) { + LOG_WARN("get next batch failed", K(ret)); + } else { + brs_.size_ = child_brs->size_; + brs_.end_ = child_brs->end_; + is_iter_end = brs_.end_ && 0 == brs_.size_; + if (brs_.size_ > 0) { + brs_.skip_->deep_copy(*(child_brs->skip_), brs_.size_); + row_count += brs_.size_ - brs_.skip_->accumulate_bit_cnt(brs_.size_); + if (ObExternalFileFormat::FormatType::ODPS_FORMAT == format_type_) { + if (!GCONF._use_odps_jni_connector) { + ret = OB_NOT_SUPPORTED; + LOG_WARN("odps cpp connector is not supported", K(ret)); + } else { + ret = OB_NOT_SUPPORTED; + LOG_WARN("odps jni connector is not supported", K(ret)); + } + } else if (T_INTO_OUTFILE == into_type) { + if (ObExternalFileFormat::FormatType::CSV_FORMAT == format_type_) { + if (OB_FAIL(into_outfile_batch_csv(brs_, data_writer))) { + LOG_WARN("csv into outfile batch failed", K(ret)); + } + } else if (ObExternalFileFormat::FormatType::PARQUET_FORMAT == format_type_) { + #ifndef OB_BUILD_EMBED_MODE + if (OB_FAIL(into_outfile_batch_parquet(brs_, data_writer))) { + LOG_WARN("parquet into outfile batch failed", K(ret)); + } + #else + ret = OB_NOT_SUPPORTED; + LOG_WARN("parquet is not supported in embed mode", K(ret)); + #endif // OB_BUILD_EMBED_MODE + } else if (ObExternalFileFormat::FormatType::ORC_FORMAT == format_type_) { + ret = OB_NOT_SUPPORTED; + } else { + ret = OB_NOT_SUPPORTED; + LOG_WARN("not support to write into outfile format.", K(ret), K(format_type_)); + } + } else { + ObEvalCtx::BatchInfoScopeGuard guard(eval_ctx_); + guard.set_batch_size(brs_.size_); + for (int64_t i = 0; OB_SUCC(ret) && i < brs_.size_; i++) { + if (brs_.skip_->contain(i)) { + continue; + } + guard.set_batch_idx(i); + if (T_INTO_VARIABLES == into_type) { + if (OB_FAIL(into_varlist())) { + LOG_WARN("into 
varlist failed", K(ret)); + } + } else { + if (OB_FAIL(into_dumpfile(data_writer))) { + LOG_WARN("into dumpfile failed", K(ret)); + } + } + } + } + } + } + if (is_iter_end || row_count >= top_limit_cnt_) { + stop_loop = true; + } + if (OB_SUCC(ret) || is_iter_end) { // if into user variables or into dumpfile, must be one row + if ((T_INTO_VARIABLES == into_type || T_INTO_DUMPFILE == into_type) && row_count > 1) { + ret = OB_ERR_TOO_MANY_ROWS; + LOG_WARN("more than one row for into variables or into dumpfile", K(ret), K(row_count)); + } + } + } //end while + if (OB_SUCC(ret)) { // set affected rows + phy_plan_ctx->set_affected_rows(row_count); + } + if (OB_FAIL(ret)) { + need_commit_ = false; + } + return ret; + } + + int ObSelectIntoOp::inner_rescan() + { + int ret = OB_SUCCESS; + return ret; + } + + int ObSelectIntoOp::inner_close() + { + int ret = OB_SUCCESS; + ObExternalFileWriter *data_writer = NULL; + int64_t estimated_bytes = 0; + if (ObExternalFileFormat::FormatType::ODPS_FORMAT == format_type_) { + if (!GCONF._use_odps_jni_connector) { + ret = OB_NOT_SUPPORTED; + LOG_WARN("odps jni connector is not supported", K(ret)); + } else { + ret = OB_NOT_SUPPORTED; + LOG_WARN("odps jni connector is not supported", K(ret)); + } + } else if (do_partition_) { + for (ObPartitionWriterMap::iterator iter = partition_map_.begin(); + OB_SUCC(ret) && iter != partition_map_.end(); iter++) { + if (OB_ISNULL(data_writer = iter->second)) { + ret = OB_ERR_UNEXPECTED; + LOG_WARN("data writer is unexpected null", K(ret)); + } else if (OB_FAIL(data_writer->close_data_writer())) { + LOG_WARN("failed to close data writer", K(ret)); + } + } + } else if (OB_NOT_NULL(data_writer_) && OB_FAIL(data_writer_->close_data_writer())) { + LOG_WARN("failed to close data writer", K(ret)); + } + return ret; + } + + int ObSelectIntoOp::get_row_str(const int64_t buf_len, + bool is_first_row, + char *buf, + int64_t &pos) + { + int ret = OB_SUCCESS; + const ObObj &field_str = field_str_; + char 
closed_cht = char_enclose_; + //before 4_1 use output + //after 4_1 use select exprs + const ObIArray &select_exprs = (MY_SPEC.select_exprs_.empty()) ? + MY_SPEC.output_ : MY_SPEC.select_exprs_; + if (!is_first_row && line_str_.is_varying_len_char_type()) { // lines terminated by "a" + ret = databuff_printf(buf, buf_len, pos, "%.*s", line_str_.get_varchar().length(), + line_str_.get_varchar().ptr()); + } + + for (int i = 0 ; OB_SUCC(ret) && i < select_exprs.count() ; i++) { + const ObExpr *expr = select_exprs.at(i); + if (0 != closed_cht && (!is_optional_ || ob_is_string_type(expr->datum_meta_.type_))) { + // closed by "a" (for all cell) or optionally by "a" (for string cell) + if (OB_FAIL(databuff_printf(buf, buf_len, pos, "%c", closed_cht))) { + LOG_WARN("print closed character failed", K(ret), K(closed_cht)); + } + } + if (OB_SUCC(ret)) { + ObObj cell; + ObDatum *datum = NULL; + if (OB_FAIL(expr->eval(eval_ctx_, datum))) { + LOG_WARN("expr eval failed", K(ret)); + } else if (OB_FAIL(datum->to_obj(cell, expr->obj_meta_))) { + LOG_WARN("to obj failed", K(ret)); + } else if (OB_FAIL(cell.print_plain_str_literal(buf, buf_len, pos))) { // cell value + LOG_WARN("print sql failed", K(ret), K(cell)); + } else if (0 != closed_cht && (!is_optional_ || ob_is_string_type(expr->datum_meta_.type_))) { + if (OB_FAIL(databuff_printf(buf, buf_len, pos, "%c", closed_cht))) { + LOG_WARN("print closed character failed", K(ret), K(closed_cht)); + } + } + // field terminated by "a" + if (OB_SUCC(ret) && i != select_exprs.count() - 1 && field_str.is_varying_len_char_type()) { + if (OB_FAIL(databuff_printf(buf, buf_len, pos, "%.*s", field_str.get_varchar().length(), field_str.get_varchar().ptr()))) { + LOG_WARN("print field str failed", K(ret), K(field_str)); + } + } + } + } + + return ret; + } + + int ObSelectIntoOp::calc_first_file_path(ObString &path) + { + int ret = OB_SUCCESS; + ObSqlString file_name_with_suffix; + ObString file_extension; + ObSelectIntoOpInput *input = 
static_cast(input_); + ObString input_file_name = file_location_ != IntoFileLocation::SERVER_DISK + ? path.split_on('?').trim() + : path; + if (OB_ISNULL(input)) { + ret = OB_ERR_UNEXPECTED; + LOG_WARN("op input is null", K(ret)); + } else if (input_file_name.length() == 0 || path.length() == 0) { + ret = OB_INVALID_ARGUMENT; + LOG_USER_ERROR(OB_INVALID_ARGUMENT, "invalid outfile path"); + LOG_WARN("invalid outfile path", K(ret)); + } else { + if (input_file_name.ptr()[input_file_name.length() - 1] == '/'){ + OZ(file_name_with_suffix.append_fmt("%.*sdata", input_file_name.length(), input_file_name.ptr())); + } else { + OZ(file_name_with_suffix.append_fmt("%.*s", input_file_name.length(), input_file_name.ptr())); + } + if (MY_SPEC.parallel_ > 1) { + OZ(file_name_with_suffix.append_fmt("_%ld_%ld_%d", input->sqc_id_, input->task_id_, 0)); + } else { + OZ(file_name_with_suffix.append_fmt("_%d", 0)); + } + OZ(external_properties_.get_format_file_extension(format_type_, file_extension)); + if (!file_extension.empty() && file_extension.ptr()[0] != '.') { + OZ(file_name_with_suffix.append(".")); + } + OZ(file_name_with_suffix.append(file_extension)); + if (format_type_ == ObExternalFileFormat::FormatType::CSV_FORMAT) { + OZ(file_name_with_suffix.append(compression_algorithm_to_suffix(external_properties_.csv_format_.compression_algorithm_))); + } + if (file_location_ != IntoFileLocation::SERVER_DISK) { + OZ(file_name_with_suffix.append_fmt("?%.*s", path.length(), path.ptr())); + } + if (OB_SUCC(ret) && OB_FAIL(ob_write_string(ctx_.get_allocator(), file_name_with_suffix.string(), path))) { + LOG_WARN("failed to write string", K(ret)); + } + } + return ret; + } + + int ObSelectIntoOp::calc_next_file_path(ObExternalFileWriter &data_writer) + { + int ret = OB_SUCCESS; + ObSqlString url_with_suffix; + ObString file_path; + data_writer.split_file_id_++; + if (data_writer.split_file_id_ > 0) { + if (MY_SPEC.is_single_ && IntoFileLocation::SERVER_DISK != file_location_) { + 
file_path = (data_writer.split_file_id_ > 1) + ? data_writer.url_.split_on(data_writer.url_.reverse_find('.')) + : data_writer.url_; + if (OB_FAIL(url_with_suffix.assign(file_path))) { + LOG_WARN("failed to assign string", K(ret)); + } else if (OB_FAIL(url_with_suffix.append_fmt(".extend%ld", data_writer.split_file_id_))) { + LOG_WARN("failed to append string", K(ret)); + } + } else if (!MY_SPEC.is_single_) { + file_path = data_writer.url_.split_on(data_writer.url_.reverse_find('_')); + if (OB_FAIL(url_with_suffix.assign(file_path))) { + LOG_WARN("failed to assign string", K(ret)); + } else if (OB_FAIL(url_with_suffix.append_fmt("_%ld", data_writer.split_file_id_))) { + LOG_WARN("failed to append string", K(ret)); + } + } else { + ret = OB_ERR_UNEXPECTED; + LOG_WARN("get unexpected single value", K(ret)); + } + if (!MY_SPEC.is_single_) { + ObString file_extension; + OZ(external_properties_.get_format_file_extension(format_type_, file_extension)); + if (!file_extension.empty() && file_extension.ptr()[0] != '.') { + OZ(url_with_suffix.append(".")); + } + OZ(url_with_suffix.append(file_extension)); + } + if (!MY_SPEC.is_single_ + && format_type_ == ObExternalFileFormat::FormatType::CSV_FORMAT) { + OZ(url_with_suffix.append(compression_algorithm_to_suffix(external_properties_.csv_format_.compression_algorithm_))); + } + if (OB_SUCC(ret) && OB_FAIL(ob_write_string(ctx_.get_allocator(), + url_with_suffix.string(), + data_writer.url_, true))) { + LOG_WARN("failed to write string", K(ret)); + } + } else { + ret = OB_ERR_UNEXPECTED; + LOG_WARN("get unexpected split file id", K(ret)); + } + return ret; + } + // Set the current data_writer's url_ based on the incoming partition and basic_url_, each partition only needs to be calculated once, subsequent changes only need to modify the split id + int ObSelectIntoOp::calc_file_path_with_partition(ObString partition, ObExternalFileWriter &data_writer) + { + int ret = OB_SUCCESS; + ObSqlString url_with_partition; + ObString 
dir_path; + if (OB_FAIL(ob_write_string(ctx_.get_allocator(), basic_url_, data_writer.url_))) { + LOG_WARN("failed to write string", K(ret)); + } else { + dir_path = data_writer.url_.split_on(data_writer.url_.reverse_find('/')); + if (OB_FAIL(url_with_partition.assign(dir_path))) { + LOG_WARN("failed to assign string", K(ret)); + } else if (url_with_partition.length() != 0 && OB_FAIL(url_with_partition.append("/"))) { + LOG_WARN("failed to append string", K(ret)); + } else if (partition.length() != 0 && OB_FAIL(url_with_partition.append_fmt("%.*s/", + partition.length(), + partition.ptr()))) { + LOG_WARN("failed to append string", K(ret)); + } else if (partition.length() == 0 && OB_FAIL(url_with_partition.append("__NULL__/"))) { + LOG_WARN("failed to append string", K(ret)); + } else if (OB_FAIL(url_with_partition.append_fmt("%.*s", + data_writer.url_.length(), + data_writer.url_.ptr()))) { + LOG_WARN("failed to append string", K(ret)); + } else if (OB_FAIL(ob_write_string(ctx_.get_allocator(), + url_with_partition.string(), + data_writer.url_, + true))) { + LOG_WARN("failed to write string", K(ret)); + } + } + return ret; + } + + int ObSelectIntoOp::split_file(ObExternalFileWriter &data_writer) + { + int ret = OB_SUCCESS; + if (ObExternalFileFormat::FormatType::CSV_FORMAT == format_type_) { + ObCsvFileWriter *csv_data_writer = static_cast(&data_writer); + if (OB_ISNULL(csv_data_writer)) { + ret = OB_ERR_UNEXPECTED; + LOG_WARN("get unexpected null data writer", K(ret)); + } else if (!use_shared_buf_ && OB_FAIL(csv_data_writer->flush_buf())) { + LOG_WARN("failed to flush buffer", K(ret)); + } else if (has_lob_ && use_shared_buf_ && OB_FAIL(csv_data_writer->flush_shared_buf(shared_buf_))) { + // To ensure the integrity of each line in the file, when there is a lob, the shared buffer may not contain a complete line + // Therefore the remaining content in the shared buffer also needs to be flushed to the current file, in this case, the max_file_size limit cannot be 
strictly enforced + LOG_WARN("failed to flush shared buffer", K(ret)); + } + } + if (OB_FAIL(ret)) { + } else if (OB_FAIL(data_writer.close_file())) { + LOG_WARN("failed to close file", K(ret)); + } else if (OB_FAIL(calc_next_file_path(data_writer))) { + LOG_WARN("failed to calculate new file path", K(ret)); + } + return ret; + } + + int ObSelectIntoOp::check_csv_file_size(ObCsvFileWriter &data_writer) + { + int ret = OB_SUCCESS; + int64_t curr_bytes = data_writer.get_file_size(); + int64_t curr_bytes_exclude_curr_line = data_writer.get_curr_bytes_exclude_curr_line(); + int64_t curr_line_len = curr_bytes - curr_bytes_exclude_curr_line; + bool has_split = false; + bool has_use_shared_buf = use_shared_buf_; + if (has_compress_ && OB_ISNULL(data_writer.get_compress_stream_writer())) { + ret = OB_ERR_UNEXPECTED; + LOG_WARN("get unexpected null compress stream writer", K(ret)); + } else if (!(has_lob_ && has_use_shared_buf) && curr_bytes_exclude_curr_line == 0) { + } else if (file_need_split(curr_bytes)) { + if (OB_FAIL(split_file(data_writer))) { + LOG_WARN("failed to split file", K(ret)); + } else { + has_split = true; + } + } + if (OB_SUCC(ret)) { + if (has_lob_ && has_use_shared_buf) { + if (!has_compress_) { + data_writer.set_write_bytes(has_split ? 0 : curr_bytes); + } + data_writer.reset_curr_line_len(); + } else { + if (!has_compress_) { + data_writer.set_write_bytes(has_split ? curr_line_len : curr_bytes); + } + } + if (has_compress_ && has_split) { + data_writer.get_compress_stream_writer()->reuse(); + } + data_writer.update_last_line_pos(); + } + return ret; + } + + int ObSelectIntoOp::get_buf(char* &buf, int64_t &buf_len, int64_t &pos, ObCsvFileWriter &data_writer) + { + int ret = OB_SUCCESS; + buf = use_shared_buf_ ? get_shared_buf() : data_writer.get_buf(); + buf_len = use_shared_buf_ ? 
get_shared_buf_len() : data_writer.get_buf_len(); + pos = data_writer.get_curr_pos(); + if (OB_ISNULL(buf) && !use_shared_buf_ && OB_FAIL(use_shared_buf(data_writer, buf, buf_len, pos))) { + LOG_WARN("failed to use shared buffer", K(ret)); + } else if (OB_ISNULL(buf)) { + ret = OB_ERR_UNEXPECTED; + LOG_WARN("buf should not be null", K(ret)); + } + return ret; + } + + int ObSelectIntoOp::use_shared_buf(ObCsvFileWriter &data_writer, char* &buf, - uint32_t &res_len) -{ - int ret = OB_SUCCESS; - ObString ob_str; - ObString res_str; - bool has_lob_header = obj_meta.has_lob_header(); - res_len = 0; - buf = nullptr; - int64_t buf_size = 0; - if (OB_FAIL(ObTextStringHelper::read_real_string_data(allocator, expr_vector, datum_meta, - has_lob_header, ob_str, row_idx))) { - LOG_WARN("failed to get string", K(ret)); - } else if (ob_str.length() == 0 || CS_TYPE_BINARY == datum_meta.cs_type_ - || CHARSET_UTF8MB4 == ObCharset::charset_type_by_coll(datum_meta.cs_type_)) { - if (OB_FAIL(ob_write_string(allocator, ob_str, res_str))) { - LOG_WARN("failed to write string", K(ret)); - } else { - res_len = static_cast(res_str.length()); - buf = const_cast(res_str.ptr()); - } - } else if (OB_FALSE_IT(buf_size = ob_str.length() * ObCharset::MAX_MB_LEN)) { - } else if (OB_ISNULL(buf = static_cast(allocator.alloc(buf_size)))) { - ret = OB_ALLOCATE_MEMORY_FAILED; - LOG_WARN("failed to alloc memory", K(ret), K(buf_size)); - } else if (OB_FAIL(ObCharset::charset_convert(datum_meta.cs_type_, ob_str.ptr(), - ob_str.length(), CS_TYPE_UTF8MB4_BIN, - buf, buf_size, res_len, false, false))) { - LOG_WARN("failed to convert charset", K(ret)); - } - return ret; -} - -#ifndef OB_BUILD_EMBED_MODE -int ObSelectIntoOp::init_parquet_env() -{ - int ret = OB_SUCCESS; - arrow_alloc_.init(MTL_ID()); - if (OB_FAIL(setup_parquet_schema())) { - LOG_WARN("failed to set up parquet schema", K(ret)); - } else if (OB_FAIL(init_env_common())) { - LOG_WARN("failed to init env common", K(ret)); - } - return ret; -} - -int 
ObSelectIntoOp::get_parquet_logical_type(std::shared_ptr &logical_type, - const ObObjType &obj_type, - const int32_t precision, - const int32_t scale) -{ - int ret = OB_SUCCESS; - if (ObTinyIntType == obj_type) { - logical_type = parquet::LogicalType::Int(8, true); - } else if (ObSmallIntType == obj_type) { - logical_type = parquet::LogicalType::Int(16, true); - } else if (ObMediumIntType == obj_type || ObInt32Type == obj_type) { - logical_type = parquet::LogicalType::Int(32, true); - } else if (ObIntType == obj_type) { - logical_type = parquet::LogicalType::Int(64, true); - } else if (ObUTinyIntType == obj_type) { - logical_type = parquet::LogicalType::Int(8, false); - } else if (ObUSmallIntType == obj_type) { - logical_type = parquet::LogicalType::Int(16, false); - } else if (ObUMediumIntType == obj_type || ObUInt32Type == obj_type) { - logical_type = parquet::LogicalType::Int(32, false); - } else if (ObUInt64Type == obj_type) { - logical_type = parquet::LogicalType::Int(64, false); - } else if (ob_is_float_tc(obj_type) || ob_is_double_tc(obj_type)) { // float, ufloat, double, udouble - logical_type = parquet::LogicalType::None(); - } else if (ob_is_number_or_decimal_int_tc(obj_type)) { - logical_type = parquet::LogicalType::Decimal(precision, scale); - } else if (ob_is_datetime_or_mysql_datetime(obj_type)) { - logical_type = parquet::LogicalType::Timestamp(false, parquet::LogicalType::TimeUnit::MICROS); - } else if (ObTimestampType == obj_type) { - logical_type = parquet::LogicalType::Timestamp(true, parquet::LogicalType::TimeUnit::MICROS); - } else if (ob_is_date_or_mysql_date(obj_type)) { - logical_type = parquet::LogicalType::Date(); - } else if (ob_is_time_tc(obj_type)) { - logical_type = parquet::LogicalType::Time(false, parquet::LogicalType::TimeUnit::MICROS); - } else if (ob_is_year_tc(obj_type)) { - logical_type = parquet::LogicalType::Int(8, false); - } else if (ob_is_string_type(obj_type) || ObNullType == obj_type) { - logical_type = 
parquet::LogicalType::String(); - } else if (ob_is_bit_tc(obj_type) /*uint64_t*/) { - logical_type = parquet::LogicalType::Int(64, false); - } else if (ob_is_enum_or_set_type(obj_type) /*uint64_t*/) { - logical_type = parquet::LogicalType::Enum(); - } else { - // TODO(bitao): support json/bson/uuid/map/list - ret = OB_NOT_SUPPORTED; - LOG_USER_ERROR(OB_NOT_SUPPORTED, "convert this ob type to parquet type"); - LOG_WARN("unsupported obj type", K(ret), K(obj_type)); - } - return ret; -} - -int ObSelectIntoOp::get_parquet_physical_type(parquet::Type::type &physical_type, - const ObObjType &obj_type) -{ - int ret = OB_SUCCESS; - if (ObTinyIntType == obj_type || ObSmallIntType == obj_type - || ObMediumIntType == obj_type || ObInt32Type == obj_type - || ObUTinyIntType == obj_type || ObUSmallIntType == obj_type - || ObUMediumIntType == obj_type || ObUInt32Type == obj_type - || ob_is_date_or_mysql_date(obj_type) || ob_is_year_tc(obj_type)) { - physical_type = parquet::Type::INT32; - } else if (ObIntType == obj_type || ObUInt64Type == obj_type - || ob_is_datetime_or_mysql_datetime_tc(obj_type) - || ob_is_time_tc(obj_type) || ob_is_bit_tc(obj_type)) { - physical_type = parquet::Type::INT64; - } else if (ob_is_float_tc(obj_type)) { // float, ufloat - physical_type = parquet::Type::FLOAT; - } else if (ob_is_double_tc(obj_type)) { // double, udouble - physical_type = parquet::Type::DOUBLE; - } else if (ob_is_number_or_decimal_int_tc(obj_type)) { - physical_type = parquet::Type::FIXED_LEN_BYTE_ARRAY; - } else if (ob_is_string_tc(obj_type) /*varchar,char,varbinary,binary*/ - || ob_is_text_tc(obj_type) /*TinyText,MediumText,Text,LongText,TinyBLOB,MediumBLOB,BLOB,LongBLOB*/ - || ob_is_enum_or_set_type(obj_type) - || ObNullType == obj_type) { - physical_type = parquet::Type::BYTE_ARRAY; - } else { - ret = OB_NOT_SUPPORTED; - LOG_USER_ERROR(OB_NOT_SUPPORTED, "convert this ob type to parquet type"); - LOG_WARN("unsupported obj type", K(ret), K(obj_type)); - } - return ret; -} - -int 
ObSelectIntoOp::calc_parquet_decimal_length(int precision) -{ - // Put in utils? - return std::ceil((1 + precision / std::log10(2)) / 8); -} - -int ObSelectIntoOp::setup_parquet_schema() -{ - int ret = OB_SUCCESS; - ObMallocHookAttrGuard guard(ObMemAttr(MTL_ID(), "IntoParquet")); - parquet::schema::NodeVector fields; - const ObIArray &select_exprs = MY_SPEC.select_exprs_; - std::shared_ptr logical_type; - parquet::Type::type physical_type; - parquet::schema::NodePtr node; - try { - for (int64_t i = 0; OB_SUCC(ret) && i < select_exprs.count(); ++i) { - ObDatumMeta meta = select_exprs.at(i)->datum_meta_; - ObObjType obj_type = meta.get_type(); - ObString alias_name = MY_SPEC.alias_names_.strs_.at(i); - std::string column_name(alias_name.ptr(), alias_name.length()); - int primitive_length = -1; - if (OB_FAIL(check_oracle_number(obj_type, - select_exprs.at(i)->datum_meta_.precision_, - select_exprs.at(i)->datum_meta_.scale_))) { - LOG_WARN("not support number type", K(ret)); - } else if (OB_FAIL(get_parquet_logical_type(logical_type, - obj_type, - select_exprs.at(i)->datum_meta_.precision_, - select_exprs.at(i)->datum_meta_.scale_))) { - LOG_WARN("failed to get related logical type", K(ret)); - } else if (OB_FAIL(get_parquet_physical_type(physical_type, obj_type))) { - LOG_WARN("failed to get related physical type", K(ret)); - } else if (ob_is_number_or_decimal_int_tc(obj_type) - && OB_FALSE_IT(primitive_length = calc_parquet_decimal_length( - select_exprs.at(i)->datum_meta_.precision_))) { - } else { - //todo@linyi repetition level - node = parquet::schema::PrimitiveNode::Make(column_name, parquet::Repetition::OPTIONAL, - logical_type, physical_type, primitive_length); - fields.push_back(node); - } - } - if (OB_SUCC(ret)) { - parquet_writer_schema_ = std::static_pointer_cast( - parquet::schema::GroupNode::Make("schema", parquet::Repetition::REQUIRED, fields)); - } - } catch (const std::exception& ex) { - if (OB_SUCC(ret)) { - ret = OB_ERR_UNEXPECTED; - 
LOG_WARN("caught exception when setup parquet schema", K(ret), "Info", ex.what()); - LOG_USER_ERROR(OB_ERR_UNEXPECTED, ex.what()); - } - } catch (...) { - if (OB_SUCC(ret)) { - ret = OB_ERR_UNEXPECTED; - LOG_WARN("caught exception when setup parquet schema", K(ret)); - } - } - return ret; -} - -int ObSelectIntoOp::into_outfile_batch_parquet(const ObBatchRows &brs, ObExternalFileWriter *data_writer) -{ - int ret = OB_SUCCESS; - const ObIArray &select_exprs = MY_SPEC.select_exprs_; - ObArray expr_vectors; - common::ObIVector* partition_vector; - int64_t estimated_bytes = 0; - int64_t row_group_size = 0; - int64_t file_size = 0; - ObParquetFileWriter *parquet_data_writer = NULL; - ObSQLMode sql_mode = eval_ctx_.exec_ctx_.get_my_session()->get_sql_mode(); - ObDateSqlMode date_sql_mode; - date_sql_mode.init(sql_mode); - bool is_strict_mode = common::is_strict_mode(sql_mode); - for (int64_t i = 0; OB_SUCC(ret) && i < select_exprs.count(); ++i) { - if (OB_ISNULL(select_exprs.at(i))) { - ret = OB_ERR_UNEXPECTED; - LOG_WARN("get unexpected null", K(ret)); - } else if (OB_FAIL(select_exprs.at(i)->eval_vector(eval_ctx_, brs))) { - LOG_WARN("failed to eval vector", K(ret)); - } else if (OB_FAIL(expr_vectors.push_back(select_exprs.at(i)->get_vector(eval_ctx_)))) { - LOG_WARN("failed to push back vector", K(ret)); - } - } - if (OB_SUCC(ret) && do_partition_) { - if (OB_FAIL(MY_SPEC.file_partition_expr_->eval_vector(eval_ctx_, brs))) { - LOG_WARN("failed to eval batch", K(ret)); - } else if (OB_ISNULL(partition_vector = MY_SPEC.file_partition_expr_->get_vector(eval_ctx_))) { - ret = OB_ERR_UNEXPECTED; - LOG_WARN("get unexpected null vector", K(ret)); - } - } - for (int64_t row_idx = 0; OB_SUCC(ret) && row_idx < brs.size_; ++row_idx) { - if (brs.skip_->contain(row_idx)) { - // do nothing - } else if (do_partition_ && OB_FAIL(get_data_writer_for_partition(partition_vector->get_string(row_idx), - data_writer))) { - LOG_WARN("failed to set data writer for partition", K(ret)); - } 
else if (OB_ISNULL(parquet_data_writer = static_cast(data_writer))) { - ret = OB_ERR_UNEXPECTED; - LOG_WARN("get unexpected null data writer", K(ret)); - } else if (parquet_data_writer->is_file_writer_null() - && OB_FAIL(parquet_data_writer->open_parquet_file_writer(arrow_alloc_, - external_properties_.parquet_format_.row_group_size_, - external_properties_.parquet_format_.compress_type_index_, - brs.size_, - ctx_.get_allocator()))) { - LOG_WARN("failed to init parquet file writer", K(ret)); - } else if (!parquet_data_writer->is_valid_to_write()) { - ret = OB_ERR_UNEXPECTED; - LOG_WARN("get unexpected null", K(ret)); - } else { - try { - for (int64_t col_idx = 0; OB_SUCC(ret) && col_idx < select_exprs.count(); col_idx++) { - if (OB_FAIL(build_parquet_cell(parquet_data_writer->get_row_group_writer(), - select_exprs.at(col_idx)->datum_meta_, - select_exprs.at(col_idx)->obj_meta_, - expr_vectors.at(col_idx), - col_idx, - row_idx, - parquet_data_writer->get_row_batch_offset(), - parquet_data_writer->get_parquet_value_offsets().at(col_idx), - parquet_data_writer->get_parquet_row_def_levels().at(col_idx), - parquet_data_writer->get_batch_allocator(), - parquet_data_writer->get_parquet_row_batch().at(col_idx), - is_strict_mode, - date_sql_mode))) { - LOG_WARN("failed to build parquet cell", K(ret)); - } - } - parquet_data_writer->set_batch_written(false); - parquet_data_writer->increase_row_batch_offset(); - if (OB_FAIL(ret)) { - // discard unwritten data if an error occurs - parquet_data_writer->set_batch_written(true); - parquet_data_writer->reset_row_batch_offset(); - parquet_data_writer->reset_value_offsets(); - } else if (parquet_data_writer->reach_batch_end()) { - if (OB_FAIL(parquet_data_writer->write_file())) { - LOG_WARN("failed to write parquet row batch", K(ret)); - } else if (OB_FAIL(check_parquet_file_size(*parquet_data_writer))) { - LOG_WARN("failed to check parquet file size", K(ret)); - } - parquet_data_writer->set_batch_written(true); - 
parquet_data_writer->reset_row_batch_offset(); - parquet_data_writer->reset_value_offsets(); - } - } catch (const std::exception& ex) { - if (OB_SUCC(ret)) { - ret = OB_ERR_UNEXPECTED; - LOG_WARN("caught exception when write parquet file", K(ret), "Info", ex.what()); - LOG_USER_ERROR(OB_ERR_UNEXPECTED, ex.what()); - } - } catch (...) { - if (OB_SUCC(ret)) { - ret = OB_ERR_UNEXPECTED; - LOG_WARN("caught exception when write parquet file", K(ret)); - } - } - } - } - return ret; -} - -int ObSelectIntoOp::oracle_timestamp_to_int96(const common::ObIVector* expr_vector, - int64_t row_idx, - const ObDatumMeta &datum_meta, - parquet::Int96 &res) -{ - int ret = OB_SUCCESS; - int64_t out_usec = 0; - int32_t tmp_offset = 0; - ObOTimestampData oracle_timestamp; - uint32_t julian_date_value = (out_usec / 86400000000LL) + 2440588; - uint64_t nsec_time_value = oracle_timestamp.time_ctx_.tail_nsec_ + std::abs(out_usec % 86400000000LL) * 1000; - res.value[2] = julian_date_value; - res.value[1] = nsec_time_value >> 32; - res.value[0] = nsec_time_value & UINT32_MAX; - return ret; -} - -int ObSelectIntoOp::check_parquet_file_size(ObParquetFileWriter &data_writer) -{ - int ret = OB_SUCCESS; - int64_t row_group_size = data_writer.get_row_group_size(); - int64_t file_size = data_writer.get_file_size(); - if (file_need_split(file_size)) { - if (OB_FAIL(split_file(data_writer))) { - LOG_WARN("failed to split file", K(ret)); - } else { - data_writer.set_write_bytes(0); - } - } else if (row_group_size > external_properties_.parquet_format_.row_group_size_) { - data_writer.get_row_group_writer()->Close(); - data_writer.set_write_bytes(file_size); - data_writer.open_next_row_group_writer(); - } - return ret; -} - -int ObSelectIntoOp::build_parquet_cell(parquet::RowGroupWriter* rg_writer, + int64_t &buf_len, + int64_t &pos) + { + int ret = OB_SUCCESS; + int64_t curr_pos = data_writer.get_curr_pos(); + if (!use_shared_buf_ && data_writer.get_last_line_pos() == 0) { + if 
(OB_NOT_NULL(data_writer.get_buf()) && curr_pos > 0) { + MEMCPY(shared_buf_, data_writer.get_buf(), curr_pos); + } + use_shared_buf_ = true; + buf = shared_buf_; + buf_len = shared_buf_len_; + pos = curr_pos; + } else { + ret = OB_ERR_UNEXPECTED; + LOG_WARN("last line should be flushed before this line copied", K(ret)); + } + return ret; + } + + int ObSelectIntoOp::resize_buf(char* &buf, + int64_t &buf_len, + int64_t &pos, + int64_t curr_pos, + bool is_json) + { + int ret = OB_SUCCESS; + int64_t new_buf_len = buf_len * 2; + char* new_buf = NULL; + if (OB_ISNULL(new_buf = static_cast(ctx_.get_allocator().alloc(new_buf_len)))) { + ret = OB_ALLOCATE_MEMORY_FAILED; + LOG_WARN("failed to allocate buffer", K(ret), K(new_buf_len)); + } else if (!is_json) { + if (curr_pos > 0) { + MEMCPY(new_buf, shared_buf_, curr_pos); + } + shared_buf_ = new_buf; + shared_buf_len_ = new_buf_len; + } else { + json_buf_ = new_buf; + json_buf_len_ = new_buf_len; + } + if (OB_SUCC(ret)) { + buf = new_buf; + buf_len = new_buf_len; + pos = is_json ? 
0 : curr_pos; + } + return ret; + } + + int ObSelectIntoOp::resize_or_flush_shared_buf(ObCsvFileWriter &data_writer, + char* &buf, + int64_t &buf_len, + int64_t &pos) + { + int ret = OB_SUCCESS; + if (!use_shared_buf_) { + ret = OB_ERR_UNEXPECTED; + LOG_WARN("get invalid argument", K(use_shared_buf_), K(ret)); + } else if (has_lob_ && data_writer.get_curr_pos() > 0) { + if (OB_FAIL(data_writer.flush_shared_buf(shared_buf_, true))) { + LOG_WARN("failed to flush shared buffer", K(ret)); + } else { + pos = 0; + } + } else if (OB_FAIL(resize_buf(buf, buf_len, pos, data_writer.get_curr_pos()))) { + LOG_WARN("failed to resize shared buffer", K(ret)); + } + return ret; + } + + int ObSelectIntoOp::check_buf_sufficient(ObCsvFileWriter &data_writer, + char* &buf, + int64_t &buf_len, + int64_t &pos, + int64_t str_len) + { + int ret = OB_SUCCESS; + if (buf_len < str_len * 1.1) { + if (OB_FAIL(data_writer.flush_buf())) { + LOG_WARN("failed to flush buffer", K(ret)); + } else if (OB_FAIL(use_shared_buf(data_writer, buf, buf_len, pos))) { + LOG_WARN("failed to use shared buffer", K(ret)); + } + } + return ret; + } + + int ObSelectIntoOp::write_obj_to_file(const ObObj &obj, ObCsvFileWriter &data_writer, bool need_escape) + { + int ret = OB_SUCCESS; + // binary collation do not require to escape when encode with base64/hex + if (obj.get_collation_type() == CS_TYPE_BINARY && + (print_params_.binary_string_print_hex_ || print_params_.binary_string_print_base64_)) { + need_escape = false; + } + + if ((obj.is_string_type() || obj.is_json() || obj.is_collection_sql_type()) && need_escape) { + if (OB_FAIL(print_str_or_json_with_escape(obj, data_writer))) { + LOG_WARN("failed to print str or json with escape", K(ret)); + } + } else if (OB_FAIL(print_normal_obj_without_escape(obj, data_writer))) { + LOG_WARN("failed to print normal obj without escape", K(ret)); + } + return ret; + } + + int ObSelectIntoOp::print_str_or_json_with_escape(const ObObj &obj, ObCsvFileWriter &data_writer) + { + 
int ret = OB_SUCCESS; + char* buf = NULL; + int64_t buf_len = 0; + int64_t pos = 0; + ObCharsetType src_type = ObCharset::charset_type_by_coll(obj.get_collation_type()); + ObCharsetType dst_type = ObCharset::charset_type_by_coll(cs_type_); + escape_printer_.do_encode_ = !(src_type == CHARSET_BINARY || src_type == dst_type + || src_type == CHARSET_INVALID); + escape_printer_.need_enclose_ = has_enclose_ && !obj.is_null(); + escape_printer_.do_escape_ = true; + escape_printer_.print_hex_ = obj.get_collation_type() == CS_TYPE_BINARY + && print_params_.binary_string_print_hex_; + ObString str_to_escape; + ObEvalCtx::TempAllocGuard tmp_alloc_g(eval_ctx_); + common::ObArenaAllocator &temp_allocator = tmp_alloc_g.get_allocator(); + if (OB_FAIL(get_buf(escape_printer_.buf_, escape_printer_.buf_len_, escape_printer_.pos_, data_writer))) { + LOG_WARN("failed to get buffer", K(ret)); + } else if (obj.is_json() || obj.is_collection_sql_type()) { + ObObj inrow_obj = obj; + if (obj.is_lob_storage() + && OB_FAIL(ObTextStringIter::convert_outrow_lob_to_inrow_templob(obj, inrow_obj, NULL, &temp_allocator))) { + LOG_WARN("failed to convert outrow lobs", K(ret), K(obj)); + } else if (obj.is_collection_sql_type()) { + ObSubSchemaValue sub_meta; + if (OB_FAIL((get_exec_ctx().get_sqludt_meta_by_subschema_id(obj.get_meta().get_subschema_id(), sub_meta)))) { + LOG_WARN("failed to get collection subschema", K(ret), K(obj.get_meta().get_subschema_id())); + } else { + print_params_.coll_meta_ = reinterpret_cast(sub_meta.value_); + } + } + if (OB_FAIL(ret)) { + } else if (OB_FAIL(print_json_to_json_buf(inrow_obj, buf, buf_len, pos, data_writer))) { + LOG_WARN("failed to print normal obj without escape", K(ret)); + } else { + str_to_escape.assign_ptr(buf, pos); + escape_printer_.do_encode_ = false; + } + } else { + str_to_escape = obj.get_varchar(); + } + if (OB_SUCC(ret) && !use_shared_buf_ && OB_FAIL(check_buf_sufficient(data_writer, + escape_printer_.buf_, + escape_printer_.buf_len_, + 
escape_printer_.pos_, + str_to_escape.length()))) { + LOG_WARN("failed to check if buf is sufficient", K(ret)); + } + if (OB_SUCC(ret) && !use_shared_buf_) { + if (OB_FAIL(ObFastStringScanner::foreach_char(str_to_escape, + src_type, + escape_printer_, + escape_printer_.do_encode_, + escape_printer_.ignore_convert_failed_))) { + if (OB_SIZE_OVERFLOW != ret) { + LOG_WARN("failed to print plain str", K(ret), K(src_type), K(escape_printer_.do_encode_)); + } else if (OB_FAIL(data_writer.flush_buf())) { + LOG_WARN("failed to flush buffer", K(ret)); + } else if (OB_FALSE_IT(escape_printer_.pos_ = data_writer.get_curr_pos())) { + } else if (OB_FAIL(ObFastStringScanner::foreach_char(str_to_escape, + src_type, + escape_printer_, + escape_printer_.do_encode_, + escape_printer_.ignore_convert_failed_))) { + if (OB_SIZE_OVERFLOW != ret) { + LOG_WARN("failed to print plain str", K(ret), K(src_type), K(escape_printer_.do_encode_)); + } else if (OB_FAIL(use_shared_buf(data_writer, + escape_printer_.buf_, + escape_printer_.buf_len_, + escape_printer_.pos_))) { + LOG_WARN("failed to use shared buffer", K(ret)); + } + } + } + } + if (OB_SUCC(ret) && use_shared_buf_) { + do { + if (OB_FAIL(ObFastStringScanner::foreach_char(str_to_escape, + src_type, + escape_printer_, + escape_printer_.do_encode_, + escape_printer_.ignore_convert_failed_))) { + LOG_WARN("failed to print plain str", K(ret), K(src_type), K(escape_printer_.do_encode_)); + } + } while (OB_SIZE_OVERFLOW == ret && OB_SUCC(resize_or_flush_shared_buf(data_writer, + escape_printer_.buf_, + escape_printer_.buf_len_, + escape_printer_.pos_))); + if (OB_FAIL(ret)) { + LOG_WARN("failed to print plain str", K(ret)); + } + } + if (OB_SUCC(ret)) { + data_writer.set_curr_pos(escape_printer_.pos_); + } + + return ret; + } + + int ObSelectIntoOp::print_normal_obj_without_escape(const ObObj &obj, ObCsvFileWriter &data_writer) + { + int ret = OB_SUCCESS; + char* buf = NULL; + int64_t buf_len = 0; + int64_t pos = 0; + OZ(get_buf(buf, 
buf_len, pos, data_writer)); + if (OB_SUCC(ret) && !use_shared_buf_) { + if (OB_FAIL(obj.print_plain_str_literal(buf, buf_len, pos, print_params_))) { + if (OB_SIZE_OVERFLOW != ret) { + LOG_WARN("failed to print obj", K(ret)); + } else if (OB_FAIL(data_writer.flush_buf())) { + LOG_WARN("failed to flush buffer", K(ret)); + } else if (OB_FALSE_IT(pos = data_writer.get_curr_pos())) { + } else if (OB_FAIL(obj.print_plain_str_literal(buf, buf_len, pos, print_params_))) { + if (OB_SIZE_OVERFLOW != ret) { + LOG_WARN("failed to print obj", K(ret)); + } else if (OB_FAIL(use_shared_buf(data_writer, buf, buf_len, pos))) { + LOG_WARN("failed to use shared buffer", K(ret)); + } + } + } + } + if (OB_SUCC(ret) && use_shared_buf_) { + do { + if (OB_FAIL(obj.print_plain_str_literal(buf, buf_len, pos, print_params_))) { + LOG_WARN("failed to print obj", K(ret)); + } + } while (OB_SIZE_OVERFLOW == ret + && OB_SUCC(resize_or_flush_shared_buf(data_writer, buf, buf_len, pos))); + if (OB_FAIL(ret)) { + LOG_WARN("failed to print obj", K(ret)); + } + } + if (OB_SUCC(ret)) { + data_writer.set_curr_pos(pos); + } + return ret; + } + + int ObSelectIntoOp::print_json_to_json_buf(const ObObj &obj, + char* &buf, + int64_t &buf_len, + int64_t &pos, + ObCsvFileWriter &data_writer) + { + int ret = OB_SUCCESS; + buf = get_json_buf(); + buf_len = get_json_buf_len(); + pos = 0; + do { + if (OB_FAIL(obj.print_plain_str_literal(buf, buf_len, pos, print_params_))) { + LOG_WARN("failed to print obj", K(ret)); + } + } while (OB_SIZE_OVERFLOW == ret + && OB_SUCC(resize_buf(buf, buf_len, pos, data_writer.get_curr_pos(), true))); + if (OB_FAIL(ret)) { + LOG_WARN("failed to print json to json buffer", K(ret)); + } + return ret; + } + + int ObSelectIntoOp::write_lob_to_file(const ObObj &obj, + const ObExpr &expr, + const ObDatum &datum, + ObCsvFileWriter &data_writer) + { + int ret = OB_SUCCESS; + ObCharsetType src_type = ObCharset::charset_type_by_coll(obj.get_collation_type()); + ObCharsetType dst_type = 
ObCharset::charset_type_by_coll(cs_type_); + escape_printer_.need_enclose_ = has_enclose_; + escape_printer_.do_encode_ = !(src_type == CHARSET_BINARY || src_type == dst_type + || src_type == CHARSET_INVALID); + escape_printer_.do_escape_ = has_escape_; + escape_printer_.print_hex_ = obj.get_collation_type() == CS_TYPE_BINARY + && print_params_.binary_string_print_hex_; + ObDatumMeta input_meta = expr.datum_meta_; + ObTextStringIterState state; + ObString src_block_data; + ObTextStringIter lob_iter(input_meta.type_, input_meta.cs_type_, datum.get_string(), + expr.obj_meta_.has_lob_header()); + ObEvalCtx::TempAllocGuard tmp_alloc_g(eval_ctx_); + common::ObArenaAllocator &temp_allocator = tmp_alloc_g.get_allocator(); + int64_t truncated_len = 0; + bool stop_when_truncated = false; + OZ(lob_iter.init(0, NULL, &temp_allocator)); + OZ(get_buf(escape_printer_.buf_, escape_printer_.buf_len_, escape_printer_.pos_, data_writer)); + // When truncated_len == src_block_data.length() when truncated length equals source block data length + // Indicates that the current foreach_char is processing only invalid data at the end of the lob, i.e., truncated data from the previous round, to avoid infinite loops + while (OB_SUCC(ret) + && (state = lob_iter.get_next_block(src_block_data)) == TEXTSTRING_ITER_NEXT) { + // outrow lob will only be false on the last iteration, inrow lob iterates only once, and is false + stop_when_truncated = (truncated_len != src_block_data.length()) && lob_iter.is_outrow_lob(); + if (!use_shared_buf_ && OB_FAIL(check_buf_sufficient(data_writer, + escape_printer_.buf_, + escape_printer_.buf_len_, + escape_printer_.pos_, + src_block_data.length()))) { + LOG_WARN("failed to check if buf is sufficient", K(ret)); + } + if (OB_SUCC(ret) && !use_shared_buf_) { + if (OB_FAIL(ObFastStringScanner::foreach_char(src_block_data, + src_type, + escape_printer_, + escape_printer_.do_encode_, + escape_printer_.ignore_convert_failed_, + stop_when_truncated, + 
&truncated_len))) { + if (OB_ERR_DATA_TRUNCATED == ret && stop_when_truncated) { + lob_iter.set_reserved_byte_len(truncated_len); + ret = OB_SUCCESS; + } else if (OB_SIZE_OVERFLOW != ret) { + LOG_WARN("failed to print lob", K(ret)); + } else if (OB_FAIL(data_writer.flush_buf())) { + LOG_WARN("failed to flush buffer", K(ret)); + } else if (OB_FALSE_IT(escape_printer_.pos_ = data_writer.get_curr_pos())) { + } else if (OB_FAIL(ObFastStringScanner::foreach_char(src_block_data, + src_type, + escape_printer_, + escape_printer_.do_encode_, + escape_printer_.ignore_convert_failed_, + stop_when_truncated, + &truncated_len))) { + if (OB_ERR_DATA_TRUNCATED == ret && stop_when_truncated) { + lob_iter.set_reserved_byte_len(truncated_len); + ret = OB_SUCCESS; + } else if (OB_SIZE_OVERFLOW != ret) { + LOG_WARN("failed to print lob", K(ret)); + } else if (OB_FAIL(use_shared_buf(data_writer, + escape_printer_.buf_, + escape_printer_.buf_len_, + escape_printer_.pos_))) { + LOG_WARN("failed to use shared buffer", K(ret)); + } + } + } + } + if (OB_SUCC(ret) && use_shared_buf_) { + if (OB_FAIL(ObFastStringScanner::foreach_char(src_block_data, + src_type, + escape_printer_, + escape_printer_.do_encode_, + escape_printer_.ignore_convert_failed_, + stop_when_truncated, + &truncated_len))) { + if (OB_ERR_DATA_TRUNCATED == ret && stop_when_truncated) { + lob_iter.set_reserved_byte_len(truncated_len); + ret = OB_SUCCESS; + } else if (OB_SIZE_OVERFLOW != ret) { + LOG_WARN("failed to print lob", K(ret)); + } else if (OB_FAIL(data_writer.flush_shared_buf(shared_buf_, true))) { + LOG_WARN("failed to flush shared buffer", K(ret)); + } else if (OB_FALSE_IT(escape_printer_.pos_ = 0)) { + } else if (OB_FAIL(ObFastStringScanner::foreach_char(src_block_data, + src_type, + escape_printer_, + escape_printer_.do_encode_, + escape_printer_.ignore_convert_failed_, + stop_when_truncated, + &truncated_len))) { + if (OB_ERR_DATA_TRUNCATED == ret && stop_when_truncated) { + 
lob_iter.set_reserved_byte_len(truncated_len); + ret = OB_SUCCESS; + } else { + LOG_WARN("failed to print lob", K(ret), K(src_block_data.length()), K(shared_buf_len_), + K(data_writer.get_curr_pos()), K(escape_printer_.buf_len_), K(escape_printer_.pos_)); + } + } + } + } + data_writer.set_curr_pos(escape_printer_.pos_); + } + if (OB_FAIL(ret)) { + } else if (state != TEXTSTRING_ITER_NEXT && state != TEXTSTRING_ITER_END) { + ret = (lob_iter.get_inner_ret() != OB_SUCCESS) ? + lob_iter.get_inner_ret() : OB_INVALID_DATA; + LOG_WARN("iter state invalid", K(ret), K(state), K(lob_iter)); + } + return ret; + } + + int ObSelectIntoOp::write_single_char_to_file(const char *wchar, ObCsvFileWriter &data_writer) + { + int ret = OB_SUCCESS; + char* buf = NULL; + int64_t buf_len = 0; + int64_t pos = 0; + OZ(get_buf(buf, buf_len, pos, data_writer)); + if (OB_SUCC(ret) && !use_shared_buf_) { + if (pos < buf_len) { + MEMCPY(buf + pos, wchar, 1); + data_writer.set_curr_pos(pos + 1); + } else if (OB_FAIL(data_writer.flush_buf())) { + LOG_WARN("failed to flush buffer", K(ret)); + } else if (OB_FALSE_IT(pos = data_writer.get_curr_pos())) { + } else if (pos < buf_len) { + MEMCPY(buf + pos, wchar, 1); + data_writer.set_curr_pos(pos + 1); + } else if (OB_FAIL(use_shared_buf(data_writer, buf, buf_len, pos))) { + LOG_WARN("failed to use shared buffer", K(ret)); + } + } + if (OB_SUCC(ret) && use_shared_buf_) { + if (pos < buf_len) { + MEMCPY(buf + pos, wchar, 1); + data_writer.set_curr_pos(pos + 1); + } else if (OB_FAIL(resize_or_flush_shared_buf(data_writer, buf, buf_len, pos))) { + LOG_WARN("failed to resize or flush shared buffer", K(ret)); + } else if (pos < buf_len) { + MEMCPY(buf + pos, wchar, 1); + data_writer.set_curr_pos(pos + 1); + } else { + ret = OB_ERR_UNEXPECTED; + LOG_WARN("get unexpected error", K(ret)); + } + } + return ret; + } + + int ObSelectIntoOp::print_lob_field(const ObObj &obj, + const ObExpr &expr, + const ObDatum &datum, + ObCsvFileWriter &data_writer) + { + int ret 
= OB_SUCCESS; + if (has_enclose_) { + OZ(write_single_char_to_file(&char_enclose_, data_writer)); + } + OZ(write_lob_to_file(obj, expr, datum, data_writer)); + if (has_enclose_) { + OZ(write_single_char_to_file(&char_enclose_, data_writer)); + } + return ret; + } + + int ObSelectIntoOp::print_field(const ObObj &obj, ObCsvFileWriter &data_writer) + { + int ret = OB_SUCCESS; + char char_n = 'N'; + const bool need_enclose = has_enclose_ && !obj.is_null() + && (!is_optional_ || obj.is_string_type() || obj.is_collection_sql_type() + || obj.is_json() || obj.is_geometry() || obj.is_date() + || obj.is_time() || obj.is_timestamp() || obj.is_datetime() + || obj.is_mysql_date() || obj.is_mysql_datetime()); + if (need_enclose) { + OZ(write_single_char_to_file(&char_enclose_, data_writer)); + } + if (!has_escape_) { + OZ(write_obj_to_file(obj, data_writer, false)); + } else if (obj.is_null()) { + OZ(write_single_char_to_file(&char_escape_, data_writer)); + OZ(write_single_char_to_file(&char_n, data_writer)); + } else { + OZ(write_obj_to_file(obj, data_writer, true)); + } + if (need_enclose) { + OZ(write_single_char_to_file(&char_enclose_, data_writer)); + } + return ret; + } + + int ObSelectIntoOp::into_outfile(ObExternalFileWriter *data_writer) + { + int ret = OB_SUCCESS; + const ObIArray &select_exprs = MY_SPEC.select_exprs_; + ObDatum *datum = NULL; + ObObj obj; + ObDatum *partition_datum = NULL; + ObCsvFileWriter *csv_data_writer = NULL; + if (do_partition_) { + if (OB_FAIL(MY_SPEC.file_partition_expr_->eval(eval_ctx_, partition_datum))) { + LOG_WARN("eval expr failed", K(ret)); + } else if (OB_ISNULL(partition_datum)) { + ret = OB_ERR_UNEXPECTED; + LOG_WARN("get unexpected null", K(ret)); + } else if (OB_FAIL(get_data_writer_for_partition(partition_datum->get_string(), data_writer))) { + LOG_WARN("failed to set data writer for partition", K(ret)); + } + } + if (OB_SUCC(ret)) { + if (OB_ISNULL(csv_data_writer = static_cast(data_writer))) { + ret = OB_ERR_UNEXPECTED; + 
LOG_WARN("get unexpected null data writer", K(ret)); + } + } + for (int64_t i = 0; OB_SUCC(ret) && i < select_exprs.count(); ++i) { + if (OB_ISNULL(select_exprs.at(i))) { + ret = OB_ERR_UNEXPECTED; + LOG_WARN("select expr is unexpected null", K(ret)); + } else if (OB_FAIL(select_exprs.at(i)->eval(eval_ctx_, datum))) { + LOG_WARN("eval expr failed", K(ret)); + } else if (OB_ISNULL(datum)) { + ret = OB_ERR_UNEXPECTED; + LOG_WARN("datum is unexpected null", K(ret)); + } else if (OB_FAIL(datum->to_obj(obj, + select_exprs.at(i)->obj_meta_, + select_exprs.at(i)->obj_datum_map_))) { + LOG_WARN("failed to get obj from datum", K(ret)); + } else if (!ob_is_text_tc(select_exprs.at(i)->obj_meta_.get_type()) || obj.is_null()) { + OZ(print_field(obj, *csv_data_writer)); + } else { // text tc + OZ(print_lob_field(obj, *select_exprs.at(i), *datum, *csv_data_writer)); + } + // print field terminator + if (OB_SUCC(ret) && i != select_exprs.count() - 1) { + OZ(write_obj_to_file(field_str_, *csv_data_writer)); + } + } + // print line terminator + OZ(write_obj_to_file(line_str_, *csv_data_writer)); + // check if need split file + OZ(check_csv_file_size(*csv_data_writer)); + // clear shared buffer + OZ(csv_data_writer->flush_shared_buf(shared_buf_)); + if (has_compress_) { + OZ(csv_data_writer->flush_buf()); + } + return ret; + } + + static OB_INLINE int get_cast_ret(const bool is_strict_mode, int ret) + { + if (OB_SUCCESS != ret && !is_strict_mode) { + ret = OB_SUCCESS; + } + return ret; + } + + #define CAST_FAIL(stmt) \ + (OB_UNLIKELY((OB_SUCCESS != (ret = get_cast_ret((is_strict_mode), (stmt)))))) + + + int ObSelectIntoOp::decimal_to_string(const ObDatum &datum, const ObDatumMeta &datum_meta, - const ObObjMeta &obj_meta, - const common::ObIVector* expr_vector, - int64_t col_idx, - int64_t row_idx, - int64_t row_offset, - int64_t &value_offset, - int16_t* definition_levels, - ObIAllocator &allocator, - void* value_batch, - const bool is_strict_mode, - const ObDateSqlMode 
date_sql_mode) -{ - int ret = OB_SUCCESS; - int16_t null_definition_level = 0; - int16_t normal_definition_level = 1; - std::shared_ptr p_node; - parquet::ColumnWriter *col_writer = nullptr; - if (OB_ISNULL(expr_vector) || !parquet_writer_schema_ || OB_ISNULL(rg_writer) - || OB_ISNULL(col_writer = rg_writer->column(col_idx)) - || OB_ISNULL(definition_levels) || OB_ISNULL(value_batch) - || !(p_node = std::static_pointer_cast(parquet_writer_schema_->field(col_idx)))) { - ret = OB_ERR_UNEXPECTED; - LOG_WARN("get null ptr", K(ret)); - } else if (p_node->is_group()) { - ret = OB_NOT_SUPPORTED; - LOG_USER_ERROR(OB_NOT_SUPPORTED, "group type in parquet"); - LOG_WARN("not support group type in parquet", K(ret)); - } else { - switch (p_node->physical_type()) { - case parquet::Type::BYTE_ARRAY: - { - parquet::ByteArray* value = reinterpret_cast(value_batch); - value += value_offset; - char *buf = nullptr; - uint32_t res_len = 0; - if (expr_vector->is_null(row_idx)) { - definition_levels[row_offset] = null_definition_level; - } else if (OB_FAIL(calc_byte_array(expr_vector, - row_idx, - datum_meta, - obj_meta, - allocator, - buf, - res_len))) { - LOG_WARN("failed to calc parquet byte array", K(ret)); - } else { - value->ptr = reinterpret_cast(buf); - value->len = res_len; - value_offset++; - definition_levels[row_offset] = normal_definition_level; - } - break; - } - case parquet::Type::FIXED_LEN_BYTE_ARRAY: - { - parquet::FixedLenByteArray* value = reinterpret_cast(value_batch); - value += value_offset; - parquet::FixedLenByteArrayWriter *writer = static_cast(col_writer); - int parquet_decimal_length = writer->descr()->type_length(); - ObArrayWrap parquet_flba; - if (expr_vector->is_null(row_idx)) { - definition_levels[row_offset] = null_definition_level; - } else if (OB_FAIL(parquet_flba.allocate_array(allocator, parquet_decimal_length))) { - LOG_WARN("failed to allocate array", K(ret)); - } else if (OB_FAIL(calc_parquet_decimal_array(expr_vector, - row_idx, - datum_meta, - 
parquet_decimal_length, - parquet_flba.get_data()))) { - LOG_WARN("failed to calc parquet decimal", K(ret)); - } else { - value->ptr = parquet_flba.get_data(); - value_offset++; - definition_levels[row_offset] = normal_definition_level; - } - break; - } - case parquet::Type::DOUBLE: - { - double* value = reinterpret_cast(value_batch); - value += value_offset; - if (expr_vector->is_null(row_idx)) { - definition_levels[row_offset] = null_definition_level; - } else { - *value = expr_vector->get_double(row_idx); - value_offset++; - definition_levels[row_offset] = normal_definition_level; - } - break; - } - case parquet::Type::FLOAT: - { - float* value = reinterpret_cast(value_batch); - value += value_offset; - if (expr_vector->is_null(row_idx)) { - definition_levels[row_offset] = null_definition_level; - } else { - *value = expr_vector->get_float(row_idx); - value_offset++; - definition_levels[row_offset] = normal_definition_level; - } - break; - } - case parquet::Type::INT32: - { - int32_t* value = reinterpret_cast(value_batch); - value += value_offset; - if (expr_vector->is_null(row_idx)) { - definition_levels[row_offset] = null_definition_level; - } else if (ob_is_mysql_date_tc(datum_meta.type_)) { - ObMySQLDate mdate(expr_vector->get_int32(row_idx)); - if (CAST_FAIL(ObTimeConverter::mdate_to_date(mdate, *value, date_sql_mode))) { - LOG_WARN("mdate_to_date fail", K(ret)); - } else { - value_offset++; - definition_levels[row_offset] = normal_definition_level; - } - } else { - *value = expr_vector->get_int32(row_idx); - value_offset++; - definition_levels[row_offset] = normal_definition_level; - } - break; - } - case parquet::Type::INT64: - { - int64_t* value = reinterpret_cast(value_batch); - value += value_offset; - if (expr_vector->is_null(row_idx)) { - definition_levels[row_offset] = null_definition_level; - } else if (ob_is_mysql_datetime(datum_meta.type_)) { - ObMySQLDateTime mdatetime(expr_vector->get_int(row_idx)); - if 
(CAST_FAIL(ObTimeConverter::mdatetime_to_datetime(mdatetime, *value, date_sql_mode))) { - LOG_WARN("mdatetime_to_datetime fail", K(ret)); - } else { - value_offset++; - definition_levels[row_offset] = normal_definition_level; - } - } else { - *value = expr_vector->get_int(row_idx); - value_offset++; - definition_levels[row_offset] = normal_definition_level; - } - break; - } - case parquet::Type::INT96: - { - parquet::Int96* value = reinterpret_cast(value_batch); - value += value_offset; - if (expr_vector->is_null(row_idx)) { - definition_levels[row_offset] = null_definition_level; - } else if (OB_FAIL(oracle_timestamp_to_int96(expr_vector, row_idx, datum_meta, *value))) { - LOG_WARN("failed to convert timestamp to int96", K(ret)); - } else { - value_offset++; - definition_levels[row_offset] = normal_definition_level; - } - break; - } - default: - { - ret = OB_ERR_UNEXPECTED; - LOG_WARN("unexpected type", K(p_node->physical_type()), K(ret)); - } - } - } - return ret; -} - -int ObSelectIntoOp::calc_parquet_decimal_array(const common::ObIVector* expr_vector, + std::string &res, + ObIAllocator &allocator) + { + int ret = OB_SUCCESS; + char *buf = NULL; + int64_t pos = 0; + if (OB_ISNULL(buf = static_cast(allocator.alloc(OB_CAST_TO_VARCHAR_MAX_LENGTH)))) { + ret = OB_ALLOCATE_MEMORY_FAILED; + LOG_WARN("failed to alloc memory", K(ret)); + } else if (OB_FAIL(wide::to_string(datum.get_decimal_int(), datum.get_int_bytes(), datum_meta.scale_, + buf, OB_CAST_TO_VARCHAR_MAX_LENGTH, pos))) { + LOG_WARN("failed to get string", K(ret)); + } else { + res.assign(buf, pos); + } + return ret; + } + + + int ObSelectIntoOp::into_outfile_batch_csv(const ObBatchRows &brs, ObExternalFileWriter *data_writer) + { + int ret = OB_SUCCESS; + const ObIArray &select_exprs = MY_SPEC.select_exprs_; + ObArray datum_vectors; + ObDatum *datum = NULL; + ObObj obj; + ObDatumVector partition_datum_vector; + ObCsvFileWriter *csv_data_writer = NULL; + for (int64_t i = 0; OB_SUCC(ret) && i < 
select_exprs.count(); ++i) { + if (OB_FAIL(select_exprs.at(i)->eval_batch(eval_ctx_, *brs.skip_, brs.size_))) { + LOG_WARN("failed to eval batch", K(ret)); + } else if (OB_FAIL(datum_vectors.push_back(select_exprs.at(i)->locate_expr_datumvector(eval_ctx_)))) { + LOG_WARN("failed to push back datum vector", K(ret)); + } + } + + if (OB_SUCC(ret) && do_partition_) { + if (OB_FAIL(MY_SPEC.file_partition_expr_->eval_batch(eval_ctx_, *brs.skip_, brs.size_))) { + LOG_WARN("failed to eval batch", K(ret)); + } else { + partition_datum_vector = MY_SPEC.file_partition_expr_->locate_expr_datumvector(eval_ctx_); + } + } + for (int64_t i = 0; OB_SUCC(ret) && i < brs.size_; ++i) { + if (brs.skip_->contain(i)) { + // do nothing + } else if (do_partition_ && OB_ISNULL(partition_datum_vector.at(i))) { + ret = OB_ERR_UNEXPECTED; + LOG_WARN("get unexpected null", K(ret)); + } else if (do_partition_ && OB_FAIL(get_data_writer_for_partition(partition_datum_vector.at(i)->get_string(), + data_writer))) { + LOG_WARN("failed to set data writer for partition", K(ret)); + } else if (OB_ISNULL(csv_data_writer = static_cast(data_writer))) { + ret = OB_ERR_UNEXPECTED; + LOG_WARN("get unexpected null data writer", K(ret)); + } else if (has_compress_ && OB_ISNULL(csv_data_writer->get_compress_stream_writer()) + && OB_FAIL(csv_data_writer->init_compress_writer(ctx_.get_allocator(), + external_properties_.csv_format_.compression_algorithm_, + MY_SPEC.buffer_size_))) { + LOG_WARN("failed to init compress stream writer", K(ret)); + } else { + for (int64_t col_idx = 0; OB_SUCC(ret) && col_idx < select_exprs.count(); ++col_idx) { + if (OB_ISNULL(datum = datum_vectors.at(col_idx).at(i))) { + ret = OB_ERR_UNEXPECTED; + LOG_WARN("datum is unexpected null", K(ret)); + } else if (OB_FAIL(datum->to_obj(obj, + select_exprs.at(col_idx)->obj_meta_, + select_exprs.at(col_idx)->obj_datum_map_))) { + LOG_WARN("failed to get obj from datum", K(ret)); + } else if 
(!ob_is_text_tc(select_exprs.at(col_idx)->obj_meta_.get_type()) || obj.is_null()) { + OZ(print_field(obj, *csv_data_writer)); + } else { // text tc + OZ(print_lob_field(obj, *select_exprs.at(col_idx), *datum, *csv_data_writer)); + } + // print field terminator + if (OB_SUCC(ret) && col_idx != select_exprs.count() - 1) { + OZ(write_obj_to_file(field_str_, *csv_data_writer)); + } + } + // print line terminator + OZ(write_obj_to_file(line_str_, *csv_data_writer)); + // check if need split file + OZ(check_csv_file_size(*csv_data_writer)); + // clear shared buffer + OZ(csv_data_writer->flush_shared_buf(shared_buf_)); + if (has_compress_) { + OZ(csv_data_writer->flush_buf()); + } + } + } + return ret; + } + + int ObSelectIntoOp::get_data_from_expr_vector(const common::ObIVector* expr_vector, int row_idx, + ObObjType type, + int64_t &value, + const bool is_strict_mode, + const ObDateSqlMode date_sql_mode) + { + int ret = OB_SUCCESS; + int32_t date; + switch(type) { + case ObTinyIntType: + value = expr_vector->get_tinyint(row_idx); + break; + case ObSmallIntType: + value = expr_vector->get_smallint(row_idx); + break; + case ObMediumIntType: + value = expr_vector->get_mediumint(row_idx); + break; + case ObInt32Type: + value = expr_vector->get_int32(row_idx); + break; + case ObIntType: + value = expr_vector->get_int(row_idx); + break; + case ObYearType: + value = expr_vector->get_year(row_idx); + break; + case ObDateType: + value = expr_vector->get_date(row_idx); + break; + case ObMySQLDateType: + CAST_FAIL( + ObTimeConverter::mdate_to_date(expr_vector->get_mysql_date(row_idx), date, date_sql_mode)); + value = date; + break; + case ObMySQLDateTimeType: + CAST_FAIL(ObTimeConverter::mdatetime_to_datetime(expr_vector->get_mysql_datetime(row_idx), value, + date_sql_mode)); + break; + default: + ret = OB_OBJ_TYPE_ERROR; + } + return ret; + } + + bool ObSelectIntoOp::file_need_split(int64_t file_size) + { + return (file_location_ == IntoFileLocation::SERVER_DISK + && 
!MY_SPEC.is_single_ && file_size > MY_SPEC.max_file_size_) + || (file_location_ != IntoFileLocation::SERVER_DISK + && ((!MY_SPEC.is_single_ && file_size > min(MY_SPEC.max_file_size_, MAX_OSS_FILE_SIZE)) + || (MY_SPEC.is_single_ && file_size > MAX_OSS_FILE_SIZE))); + } + + int ObSelectIntoOp::check_oracle_number(ObObjType obj_type, int16_t &precision, int8_t scale) + { + int ret = OB_SUCCESS; + return ret; + } + + int ObSelectIntoOp::calc_byte_array(const common::ObIVector* expr_vector, + int row_idx, + const ObDatumMeta &datum_meta, + const ObObjMeta &obj_meta, + ObIAllocator &allocator, + char* &buf, + uint32_t &res_len) + { + int ret = OB_SUCCESS; + ObString ob_str; + ObString res_str; + bool has_lob_header = obj_meta.has_lob_header(); + res_len = 0; + buf = nullptr; + int64_t buf_size = 0; + if (OB_FAIL(ObTextStringHelper::read_real_string_data(allocator, expr_vector, datum_meta, + has_lob_header, ob_str, row_idx))) { + LOG_WARN("failed to get string", K(ret)); + } else if (ob_str.length() == 0 || CS_TYPE_BINARY == datum_meta.cs_type_ + || CHARSET_UTF8MB4 == ObCharset::charset_type_by_coll(datum_meta.cs_type_)) { + if (OB_FAIL(ob_write_string(allocator, ob_str, res_str))) { + LOG_WARN("failed to write string", K(ret)); + } else { + res_len = static_cast(res_str.length()); + buf = const_cast(res_str.ptr()); + } + } else if (OB_FALSE_IT(buf_size = ob_str.length() * ObCharset::MAX_MB_LEN)) { + } else if (OB_ISNULL(buf = static_cast(allocator.alloc(buf_size)))) { + ret = OB_ALLOCATE_MEMORY_FAILED; + LOG_WARN("failed to alloc memory", K(ret), K(buf_size)); + } else if (OB_FAIL(ObCharset::charset_convert(datum_meta.cs_type_, ob_str.ptr(), + ob_str.length(), CS_TYPE_UTF8MB4_BIN, + buf, buf_size, res_len, false, false))) { + LOG_WARN("failed to convert charset", K(ret)); + } + return ret; + } + + #ifndef OB_BUILD_EMBED_MODE + int ObSelectIntoOp::init_parquet_env() + { + int ret = OB_SUCCESS; + arrow_alloc_.init(MTL_ID()); + if (OB_FAIL(setup_parquet_schema())) { + 
LOG_WARN("failed to set up parquet schema", K(ret)); + } else if (OB_FAIL(init_env_common())) { + LOG_WARN("failed to init env common", K(ret)); + } + return ret; + } + + int ObSelectIntoOp::get_parquet_logical_type(std::shared_ptr &logical_type, + const ObObjType &obj_type, + const int32_t precision, + const int32_t scale) + { + int ret = OB_SUCCESS; + if (ObTinyIntType == obj_type) { + logical_type = parquet::LogicalType::Int(8, true); + } else if (ObSmallIntType == obj_type) { + logical_type = parquet::LogicalType::Int(16, true); + } else if (ObMediumIntType == obj_type || ObInt32Type == obj_type) { + logical_type = parquet::LogicalType::Int(32, true); + } else if (ObIntType == obj_type) { + logical_type = parquet::LogicalType::Int(64, true); + } else if (ObUTinyIntType == obj_type) { + logical_type = parquet::LogicalType::Int(8, false); + } else if (ObUSmallIntType == obj_type) { + logical_type = parquet::LogicalType::Int(16, false); + } else if (ObUMediumIntType == obj_type || ObUInt32Type == obj_type) { + logical_type = parquet::LogicalType::Int(32, false); + } else if (ObUInt64Type == obj_type) { + logical_type = parquet::LogicalType::Int(64, false); + } else if (ob_is_float_tc(obj_type) || ob_is_double_tc(obj_type)) { // float, ufloat, double, udouble + logical_type = parquet::LogicalType::None(); + } else if (ob_is_number_or_decimal_int_tc(obj_type)) { + logical_type = parquet::LogicalType::Decimal(precision, scale); + } else if (ob_is_datetime_or_mysql_datetime(obj_type)) { + logical_type = parquet::LogicalType::Timestamp(false, parquet::LogicalType::TimeUnit::MICROS); + } else if (ObTimestampType == obj_type) { + logical_type = parquet::LogicalType::Timestamp(true, parquet::LogicalType::TimeUnit::MICROS); + } else if (ob_is_date_or_mysql_date(obj_type)) { + logical_type = parquet::LogicalType::Date(); + } else if (ob_is_time_tc(obj_type)) { + logical_type = parquet::LogicalType::Time(false, parquet::LogicalType::TimeUnit::MICROS); + } else if 
(ob_is_year_tc(obj_type)) { + logical_type = parquet::LogicalType::Int(8, false); + } else if (ob_is_string_type(obj_type) || ObNullType == obj_type) { + logical_type = parquet::LogicalType::String(); + } else if (ob_is_bit_tc(obj_type) /*uint64_t*/) { + logical_type = parquet::LogicalType::Int(64, false); + } else if (ob_is_enum_or_set_type(obj_type) /*uint64_t*/) { + logical_type = parquet::LogicalType::Enum(); + } else { + // TODO(bitao): support json/bson/uuid/map/list + ret = OB_NOT_SUPPORTED; + LOG_USER_ERROR(OB_NOT_SUPPORTED, "convert this ob type to parquet type"); + LOG_WARN("unsupported obj type", K(ret), K(obj_type)); + } + return ret; + } + + int ObSelectIntoOp::get_parquet_physical_type(parquet::Type::type &physical_type, + const ObObjType &obj_type) + { + int ret = OB_SUCCESS; + if (ObTinyIntType == obj_type || ObSmallIntType == obj_type + || ObMediumIntType == obj_type || ObInt32Type == obj_type + || ObUTinyIntType == obj_type || ObUSmallIntType == obj_type + || ObUMediumIntType == obj_type || ObUInt32Type == obj_type + || ob_is_date_or_mysql_date(obj_type) || ob_is_year_tc(obj_type)) { + physical_type = parquet::Type::INT32; + } else if (ObIntType == obj_type || ObUInt64Type == obj_type + || ob_is_datetime_or_mysql_datetime_tc(obj_type) + || ob_is_time_tc(obj_type) || ob_is_bit_tc(obj_type)) { + physical_type = parquet::Type::INT64; + } else if (ob_is_float_tc(obj_type)) { // float, ufloat + physical_type = parquet::Type::FLOAT; + } else if (ob_is_double_tc(obj_type)) { // double, udouble + physical_type = parquet::Type::DOUBLE; + } else if (ob_is_number_or_decimal_int_tc(obj_type)) { + physical_type = parquet::Type::FIXED_LEN_BYTE_ARRAY; + } else if (ob_is_string_tc(obj_type) /*varchar,char,varbinary,binary*/ + || ob_is_text_tc(obj_type) /*TinyText,MediumText,Text,LongText,TinyBLOB,MediumBLOB,BLOB,LongBLOB*/ + || ob_is_enum_or_set_type(obj_type) + || ObNullType == obj_type) { + physical_type = parquet::Type::BYTE_ARRAY; + } else { + ret = 
OB_NOT_SUPPORTED; + LOG_USER_ERROR(OB_NOT_SUPPORTED, "convert this ob type to parquet type"); + LOG_WARN("unsupported obj type", K(ret), K(obj_type)); + } + return ret; + } + + int ObSelectIntoOp::calc_parquet_decimal_length(int precision) + { + // Put in utils? + return std::ceil((1 + precision / std::log10(2)) / 8); + } + + int ObSelectIntoOp::setup_parquet_schema() + { + int ret = OB_SUCCESS; + ObMallocHookAttrGuard guard(ObMemAttr(MTL_ID(), "IntoParquet")); + parquet::schema::NodeVector fields; + const ObIArray &select_exprs = MY_SPEC.select_exprs_; + std::shared_ptr logical_type; + parquet::Type::type physical_type; + parquet::schema::NodePtr node; + try { + for (int64_t i = 0; OB_SUCC(ret) && i < select_exprs.count(); ++i) { + ObDatumMeta meta = select_exprs.at(i)->datum_meta_; + ObObjType obj_type = meta.get_type(); + ObString alias_name = MY_SPEC.alias_names_.strs_.at(i); + std::string column_name(alias_name.ptr(), alias_name.length()); + int primitive_length = -1; + if (OB_FAIL(check_oracle_number(obj_type, + select_exprs.at(i)->datum_meta_.precision_, + select_exprs.at(i)->datum_meta_.scale_))) { + LOG_WARN("not support number type", K(ret)); + } else if (OB_FAIL(get_parquet_logical_type(logical_type, + obj_type, + select_exprs.at(i)->datum_meta_.precision_, + select_exprs.at(i)->datum_meta_.scale_))) { + LOG_WARN("failed to get related logical type", K(ret)); + } else if (OB_FAIL(get_parquet_physical_type(physical_type, obj_type))) { + LOG_WARN("failed to get related physical type", K(ret)); + } else if (ob_is_number_or_decimal_int_tc(obj_type) + && OB_FALSE_IT(primitive_length = calc_parquet_decimal_length( + select_exprs.at(i)->datum_meta_.precision_))) { + } else { + //todo@linyi repetition level + node = parquet::schema::PrimitiveNode::Make(column_name, parquet::Repetition::OPTIONAL, + logical_type, physical_type, primitive_length); + fields.push_back(node); + } + } + if (OB_SUCC(ret)) { + parquet_writer_schema_ = std::static_pointer_cast( + 
parquet::schema::GroupNode::Make("schema", parquet::Repetition::REQUIRED, fields)); + } + } catch (const std::exception& ex) { + if (OB_SUCC(ret)) { + ret = OB_ERR_UNEXPECTED; + LOG_WARN("caught exception when setup parquet schema", K(ret), "Info", ex.what()); + LOG_USER_ERROR(OB_ERR_UNEXPECTED, ex.what()); + } + } catch (...) { + if (OB_SUCC(ret)) { + ret = OB_ERR_UNEXPECTED; + LOG_WARN("caught exception when setup parquet schema", K(ret)); + } + } + return ret; + } + + int ObSelectIntoOp::into_outfile_batch_parquet(const ObBatchRows &brs, ObExternalFileWriter *data_writer) + { + int ret = OB_SUCCESS; + const ObIArray &select_exprs = MY_SPEC.select_exprs_; + ObArray expr_vectors; + common::ObIVector* partition_vector; + int64_t estimated_bytes = 0; + int64_t row_group_size = 0; + int64_t file_size = 0; + ObParquetFileWriter *parquet_data_writer = NULL; + ObSQLMode sql_mode = eval_ctx_.exec_ctx_.get_my_session()->get_sql_mode(); + ObDateSqlMode date_sql_mode; + date_sql_mode.init(sql_mode); + bool is_strict_mode = common::is_strict_mode(sql_mode); + for (int64_t i = 0; OB_SUCC(ret) && i < select_exprs.count(); ++i) { + if (OB_ISNULL(select_exprs.at(i))) { + ret = OB_ERR_UNEXPECTED; + LOG_WARN("get unexpected null", K(ret)); + } else if (OB_FAIL(select_exprs.at(i)->eval_vector(eval_ctx_, brs))) { + LOG_WARN("failed to eval vector", K(ret)); + } else if (OB_FAIL(expr_vectors.push_back(select_exprs.at(i)->get_vector(eval_ctx_)))) { + LOG_WARN("failed to push back vector", K(ret)); + } + } + if (OB_SUCC(ret) && do_partition_) { + if (OB_FAIL(MY_SPEC.file_partition_expr_->eval_vector(eval_ctx_, brs))) { + LOG_WARN("failed to eval batch", K(ret)); + } else if (OB_ISNULL(partition_vector = MY_SPEC.file_partition_expr_->get_vector(eval_ctx_))) { + ret = OB_ERR_UNEXPECTED; + LOG_WARN("get unexpected null vector", K(ret)); + } + } + for (int64_t row_idx = 0; OB_SUCC(ret) && row_idx < brs.size_; ++row_idx) { + if (brs.skip_->contain(row_idx)) { + // do nothing + } else if 
(do_partition_ && OB_FAIL(get_data_writer_for_partition(partition_vector->get_string(row_idx), + data_writer))) { + LOG_WARN("failed to set data writer for partition", K(ret)); + } else if (OB_ISNULL(parquet_data_writer = static_cast(data_writer))) { + ret = OB_ERR_UNEXPECTED; + LOG_WARN("get unexpected null data writer", K(ret)); + } else if (parquet_data_writer->is_file_writer_null() + && OB_FAIL(parquet_data_writer->open_parquet_file_writer(arrow_alloc_, + external_properties_.parquet_format_.row_group_size_, + external_properties_.parquet_format_.compress_type_index_, + brs.size_, + ctx_.get_allocator()))) { + LOG_WARN("failed to init parquet file writer", K(ret)); + } else if (!parquet_data_writer->is_valid_to_write()) { + ret = OB_ERR_UNEXPECTED; + LOG_WARN("get unexpected null", K(ret)); + } else { + try { + for (int64_t col_idx = 0; OB_SUCC(ret) && col_idx < select_exprs.count(); col_idx++) { + if (OB_FAIL(build_parquet_cell(parquet_data_writer->get_row_group_writer(), + select_exprs.at(col_idx)->datum_meta_, + select_exprs.at(col_idx)->obj_meta_, + expr_vectors.at(col_idx), + col_idx, + row_idx, + parquet_data_writer->get_row_batch_offset(), + parquet_data_writer->get_parquet_value_offsets().at(col_idx), + parquet_data_writer->get_parquet_row_def_levels().at(col_idx), + parquet_data_writer->get_batch_allocator(), + parquet_data_writer->get_parquet_row_batch().at(col_idx), + is_strict_mode, + date_sql_mode))) { + LOG_WARN("failed to build parquet cell", K(ret)); + } + } + parquet_data_writer->set_batch_written(false); + parquet_data_writer->increase_row_batch_offset(); + if (OB_FAIL(ret)) { + // discard unwritten data if an error occurs + parquet_data_writer->set_batch_written(true); + parquet_data_writer->reset_row_batch_offset(); + parquet_data_writer->reset_value_offsets(); + } else if (parquet_data_writer->reach_batch_end()) { + if (OB_FAIL(parquet_data_writer->write_file())) { + LOG_WARN("failed to write parquet row batch", K(ret)); + } else if 
(OB_FAIL(check_parquet_file_size(*parquet_data_writer))) { + LOG_WARN("failed to check parquet file size", K(ret)); + } + parquet_data_writer->set_batch_written(true); + parquet_data_writer->reset_row_batch_offset(); + parquet_data_writer->reset_value_offsets(); + } + } catch (const std::exception& ex) { + if (OB_SUCC(ret)) { + ret = OB_ERR_UNEXPECTED; + LOG_WARN("caught exception when write parquet file", K(ret), "Info", ex.what()); + LOG_USER_ERROR(OB_ERR_UNEXPECTED, ex.what()); + } + } catch (...) { + if (OB_SUCC(ret)) { + ret = OB_ERR_UNEXPECTED; + LOG_WARN("caught exception when write parquet file", K(ret)); + } + } + } + } + return ret; + } + + int ObSelectIntoOp::oracle_timestamp_to_int96(const common::ObIVector* expr_vector, + int64_t row_idx, const ObDatumMeta &datum_meta, - int parquet_decimal_length, - uint8_t* parquet_flba_ptr) -{ - int ret = OB_SUCCESS; - const ObDecimalInt* ob_decimal; - const uint8_t* decimal_bytes; - ObDecimalIntBuilder tmp_dec_alloc; - ObDecimalInt* tmp_decimal; - int ob_decimal_length = wide::ObDecimalIntConstValue::get_int_bytes_by_precision(datum_meta.precision_); - if (ob_is_decimal_int_tc(datum_meta.get_type())) { - ob_decimal = expr_vector->get_decimal_int(row_idx); - } else if (ob_is_number_tc(datum_meta.get_type())) { - number::ObNumber number(expr_vector->get_number(row_idx)); - if (OB_FAIL(wide::from_number_to_decimal_fixed_length(number, tmp_dec_alloc, datum_meta.scale_, - ob_decimal_length, tmp_decimal))){ - LOG_WARN("failed to case number to decimal int", K(ret)); - } else { - ob_decimal = tmp_decimal; - } - } else { - ret = OB_ERR_UNEXPECTED; - LOG_WARN("get unexpected type", K(datum_meta.get_type())); - } - if (OB_FAIL(ret)) { - } else if (ob_decimal_length < parquet_decimal_length) { - ret = OB_ERR_UNEXPECTED; - LOG_WARN("get unexpected decimal length", K(ob_decimal_length), K(parquet_decimal_length), K(ret)); - } else { - switch (ob_decimal_length) { - case sizeof(int32_t): - { - decimal_bytes = 
reinterpret_cast(ob_decimal->int32_v_); - break; - } - case sizeof(int64_t): - { - decimal_bytes = reinterpret_cast(ob_decimal->int64_v_); - break; - } - case sizeof(int128_t): - { - decimal_bytes = reinterpret_cast(ob_decimal->int128_v_); - break; - } - case sizeof(int256_t): - { - decimal_bytes = reinterpret_cast(ob_decimal->int256_v_); - break; - } - case sizeof(int512_t): - { - decimal_bytes = reinterpret_cast(ob_decimal->int512_v_); - break; - } - default: - { - ret = OB_ERR_UNEXPECTED; - LOG_WARN("unexpected type", K(ob_decimal_length), K(ret)); - } - } - } - for (int i = 0; OB_SUCC(ret) && i < parquet_decimal_length; i++) { - parquet_flba_ptr[i] = decimal_bytes[parquet_decimal_length - i - 1]; - } - return ret; -} -#endif // !OB_BUILD_EMBED_MODE - -int ObSelectIntoOp::into_dumpfile(ObExternalFileWriter *data_writer) -{ - int ret = OB_SUCCESS; - char buf[MAX_VALUE_LENGTH]; - int64_t buf_len = MAX_VALUE_LENGTH; - int64_t pos = 0; - if (OB_ISNULL(data_writer)) { - ret = OB_ERR_UNEXPECTED; - LOG_WARN("get unexpected null", K(ret)); - } else if (OB_FAIL(get_row_str(buf_len, is_first_, buf, pos))) { - LOG_WARN("get str failed", K(ret)); - } else if (is_first_) { // create file - if (OB_FAIL(data_writer->file_appender_.create(file_name_.get_varchar(), true))) { - LOG_WARN("create dumpfile failed", K(ret), K(file_name_)); - } else { - is_first_ = false; - } - } - if (OB_SUCC(ret)) { - if (OB_FAIL(data_writer->file_appender_.append(buf, pos, false))) { - LOG_WARN("failed to append file"); - } else { - //do nothing - } - } - return ret; -} - -int ObSelectIntoOp::into_varlist() -{ - int ret = OB_SUCCESS; - //before 4_1 use output - //after 4_1 use select exprs - const ObIArray &select_exprs = (MY_SPEC.select_exprs_.empty()) ? 
- MY_SPEC.output_ : MY_SPEC.select_exprs_; - const ObIArray &user_vars = MY_SPEC.user_vars_; - ObArenaAllocator lob_tmp_allocator("LobTmp", OB_MALLOC_NORMAL_BLOCK_SIZE, MTL_ID()); - if (select_exprs.count() != user_vars.count()) { - ret = OB_ERR_COLUMN_SIZE; - LOG_WARN("user vars count should be equal to select exprs count" , K(ret), - K(select_exprs.count()), K(user_vars.count())); - } else { - for (int i = 0 ; i < user_vars.count(); ++i) { - const ObString &var_name = user_vars.at(i); - ObObj obj; - ObDatum *datum = NULL; - if (OB_FAIL(select_exprs.at(i)->eval(eval_ctx_, datum))) { - LOG_WARN("eval expr failed", K(ret)); - } else if (OB_FAIL(datum->to_obj(obj, select_exprs.at(i)->obj_meta_))) { - LOG_WARN("convert datum to obj failed", K(ret), KPC(select_exprs.at(i))); - } else if (obj.is_lob_storage() - // outrow lob can not be assigned to user var, so convert outrow to inrow lob - // user var has independent memory, so using temporary memory here is fine - && OB_FAIL(ObTextStringIter::convert_outrow_lob_to_inrow_templob(obj, obj, nullptr, &lob_tmp_allocator, true/*allow_persist_inrow*/))) { - LOG_WARN("convert outrow to inrow lob failed", K(ret), K(obj)); - } else if (OB_FAIL(ObVariableSetExecutor::set_user_variable(obj, var_name, - ctx_.get_my_session()))) { - LOG_WARN("set user variable failed", K(ret)); - } - } - } - return ret; -} - -int ObSelectIntoOp::extract_fisrt_wchar_from_varhcar(const ObObj &obj, int32_t &wchar) -{ - int ret = OB_SUCCESS; - int32_t length = 0; - if (obj.is_varying_len_char_type()) { - ObString str = obj.get_varchar(); - if (str.length() > 0) { - ret = ObCharset::mb_wc(obj.get_collation_type(), str.ptr(), str.length(), length, wchar); - } - } - return ret; -} - -int ObSelectIntoOp::print_wchar_to_buf(char *buf, - const int64_t buf_len, - int64_t &pos, - int32_t wchar, - ObString &str, - ObCollationType coll_type) -{ - int ret = OB_SUCCESS; - int result_len = 0; - if (OB_FAIL(ObCharset::wc_mb(coll_type, wchar, buf + pos, buf_len - pos, 
result_len))) { - LOG_WARN("failed to convert wc to mb"); - } else { - str = ObString(result_len, buf + pos); - pos += result_len; - } - return ret; -} - -int ObSelectIntoOp::prepare_escape_printer() -{ - int ret = OB_SUCCESS; - int64_t pos = 0; - char *buf = NULL; - int64_t buf_len = 6 * ObCharset::MAX_MB_LEN; - // mb->wc - int32_t wchar_enclose = char_enclose_; - int32_t wchar_escape = char_escape_; - int32_t wchar_field = 0; - int32_t wchar_line = 0; - int32_t wchar_zero = '\0'; - int32_t wchar_replace = 0; - OZ(extract_fisrt_wchar_from_varhcar(field_str_, wchar_field)); - OZ(extract_fisrt_wchar_from_varhcar(line_str_, wchar_line)); - OZ(ObCharset::get_replace_character(cs_type_, wchar_replace)); - // wc->mb - if (OB_ISNULL(buf = static_cast(ctx_.get_allocator().alloc(buf_len)))) { - ret = OB_ALLOCATE_MEMORY_FAILED; - LOG_WARN("failed to allocate buffer", K(ret), K(buf_len)); - } - if (has_enclose_) { - OZ(print_wchar_to_buf(buf, buf_len, pos, wchar_enclose, escape_printer_.enclose_, cs_type_)); - } - if (has_escape_) { - OZ(print_wchar_to_buf(buf, buf_len, pos, wchar_escape, escape_printer_.escape_, cs_type_)); - } - OZ(print_wchar_to_buf(buf, buf_len, pos, wchar_zero, escape_printer_.zero_, cs_type_)); - OZ(print_wchar_to_buf(buf, buf_len, pos, wchar_field, escape_printer_.field_terminator_, cs_type_)); - OZ(print_wchar_to_buf(buf, buf_len, pos, wchar_line, escape_printer_.line_terminator_, cs_type_)); - OZ(print_wchar_to_buf(buf, buf_len, pos, wchar_replace, escape_printer_.convert_replacer_, cs_type_)); - escape_printer_.coll_type_ = cs_type_; - escape_printer_.ignore_convert_failed_ = true; // todo@linyi provide user-defined interface - return ret; -} - -int ObSelectIntoOp::check_has_lob_or_json() -{ - int ret = OB_SUCCESS; - const ObIArray &select_exprs = MY_SPEC.select_exprs_; - for (int64_t i = 0; OB_SUCC(ret) && (!has_lob_ || !has_json_ || !has_coll_) && i < select_exprs.count(); ++i) { - if (OB_ISNULL(select_exprs.at(i))) { - ret = OB_ERR_UNEXPECTED; - 
LOG_WARN("select expr is unexpected null", K(ret)); - } else if (ob_is_text_tc(select_exprs.at(i)->obj_meta_.get_type())) { - has_lob_ = true; - } else if (ob_is_json_tc(select_exprs.at(i)->obj_meta_.get_type())) { - has_json_ = true; - } else if (ob_is_collection_sql_type(select_exprs.at(i)->obj_meta_.get_type())) { - has_coll_ = true; - } - } - return ret; -} - -int ObSelectIntoOp::create_shared_buffer_for_data_writer() -{ - int ret = OB_SUCCESS; - shared_buf_len_ = has_lob_ ? (5 * SHARED_BUFFER_SIZE) : SHARED_BUFFER_SIZE; - if (OB_ISNULL(shared_buf_ = static_cast(ctx_.get_allocator().alloc(shared_buf_len_)))) { - ret = OB_ALLOCATE_MEMORY_FAILED; - LOG_WARN("failed to allocate buffer", K(ret), K(shared_buf_len_)); - } - if (OB_SUCC(ret) && (has_json_ || has_coll_) && has_escape_) { - json_buf_len_ = OB_MALLOC_MIDDLE_BLOCK_SIZE; - if (OB_ISNULL(json_buf_ = static_cast(ctx_.get_allocator().alloc(json_buf_len_)))) { - ret = OB_ALLOCATE_MEMORY_FAILED; - LOG_WARN("failed to allocate buffer", K(ret), K(json_buf_len_)); - } - } - return ret; -} - -int ObSelectIntoOp::check_secure_file_path(ObString file_name) -{ - int ret = OB_SUCCESS; - ObString file_path = file_name.split_on(file_name.reverse_find('/')); - char full_path_buf[PATH_MAX+1]; - char *actual_path = nullptr; - ObSqlString sql_str; - ObString secure_file_priv; - int64_t tenant_id = MTL_ID(); - if (OB_FAIL(sql_str.append(file_path.empty() ? "." 
: file_path))) { - LOG_WARN("failed to append string", K(ret)); -#ifdef _WIN32 - } else if (OB_ISNULL(actual_path = _fullpath(full_path_buf, sql_str.ptr(), PATH_MAX))) { -#else - } else if (OB_ISNULL(actual_path = realpath(sql_str.ptr(), full_path_buf))) { -#endif - ret = OB_FILE_NOT_EXIST; - LOG_WARN("file not exist", K(ret), K(sql_str)); - } else if (OB_FAIL(ObSchemaUtils::get_tenant_varchar_variable(tenant_id, - SYS_VAR_SECURE_FILE_PRIV, - ctx_.get_allocator(), - secure_file_priv))) { - LOG_WARN("fail get tenant variable", K(tenant_id), K(secure_file_priv), K(ret)); - } else if (OB_FAIL(ObResolverUtils::check_secure_path(secure_file_priv, actual_path))) { - LOG_WARN("failed to check secure path", K(ret), K(secure_file_priv)); - if (OB_ERR_NO_PRIVILEGE == ret) { - ret = OB_ERR_NO_PRIV_DIRECT_PATH_ACCESS; - LOG_ERROR("failed to check secure path", K(ret), K(secure_file_priv)); - } - } - return ret; -} - -int ObSelectIntoOp::get_data_writer_for_partition(const ObString &partition_str, - ObExternalFileWriter *&data_writer) -{ - int ret = OB_SUCCESS; - ObString partition; - ObExternalFileWriter *value = NULL; - ObCsvFileWriter *csv_data_writer = NULL; - if (OB_SUCC(partition_map_.get_refactored(partition_str, value))) { - if (OB_ISNULL(value)) { - ret = OB_ERR_UNEXPECTED; - LOG_WARN("get unexpected null", K(ret)); - } else { - data_writer = value; - } - } else if (OB_UNLIKELY(OB_HASH_NOT_EXIST != ret)) { - LOG_WARN("get unexpected error", K(ret)); - } else if (curr_partition_num_ >= OB_MAX_PARTITION_NUM_ORACLE) { - ret = OB_TOO_MANY_PARTITIONS_ERROR; - LOG_WARN("too many partitions", K(ret)); - } else { - ret = OB_SUCCESS; - bool writer_added = false; - if (OB_FAIL(new_data_writer(data_writer))) { - LOG_WARN("failed to new data writer", K(ret)); - } else if (OB_ISNULL(data_writer)) { - ret = OB_ERR_UNEXPECTED; - LOG_WARN("get unexpected null", K(ret)); - } else if (ObExternalFileFormat::FormatType::CSV_FORMAT == format_type_ && MY_SPEC.buffer_size_ > 0) { - 
csv_data_writer = static_cast(data_writer); - if (OB_FAIL(csv_data_writer->alloc_buf(ctx_.get_allocator(), MY_SPEC.buffer_size_))) { - LOG_WARN("failed to alloc buffer", K(ret)); - } - } - //add to hashmap - if (OB_FAIL(ret)) { - } else if (OB_FAIL(ob_write_string(ctx_.get_allocator(), - partition_str, - partition))) { - LOG_WARN("failed to write string", K(ret)); - } else if (OB_FAIL(partition_map_.set_refactored(partition, data_writer))) { - LOG_WARN("failed to add data writer to map", K(ret)); - } else { - curr_partition_num_++; - writer_added = true; - } - if (OB_FAIL(ret) && NULL != data_writer && !writer_added) { - data_writer->~ObExternalFileWriter(); - } - //calc file path - if (OB_SUCC(ret) && OB_FAIL(calc_file_path_with_partition(partition, *data_writer))) { - LOG_WARN("failed to calc file path with partition", K(ret)); - } - } - return ret; -} - -int ObSelectIntoOp::create_the_only_data_writer(ObExternalFileWriter *&data_writer) -{ - int ret = OB_SUCCESS; - ObCsvFileWriter *csv_data_writer = NULL; - if (OB_FAIL(new_data_writer(data_writer))) { - LOG_WARN("failed to new data writer", K(ret)); - } else if (OB_ISNULL(data_writer)) { - ret = OB_ERR_UNEXPECTED; - LOG_WARN("get unexpected null", K(ret)); - } else { - data_writer->url_ = basic_url_; - data_writer_ = data_writer; - } - if (OB_FAIL(ret)) { - } else if (T_INTO_OUTFILE == MY_SPEC.into_type_ && MY_SPEC.is_single_ - && OB_FAIL(data_writer->open_file())) { - LOG_WARN("failed to open file", K(ret)); - } else if (ObExternalFileFormat::FormatType::CSV_FORMAT == format_type_ && MY_SPEC.buffer_size_ > 0) { - csv_data_writer = static_cast(data_writer); - if (OB_FAIL(csv_data_writer->alloc_buf(ctx_.get_allocator(), MY_SPEC.buffer_size_))) { - LOG_WARN("failed to alloc buffer", K(ret)); - } - } - return ret; -} - -int ObSelectIntoOp::new_data_writer(ObExternalFileWriter *&data_writer) -{ - int ret = OB_SUCCESS; - void *ptr = NULL; - switch (format_type_) - { - case 
ObExternalFileFormat::FormatType::CSV_FORMAT: - { - if (OB_ISNULL(ptr = ctx_.get_allocator().alloc(sizeof(ObCsvFileWriter)))) { - ret = OB_ALLOCATE_MEMORY_FAILED; - LOG_WARN("failed to allocate data writer", K(ret), K(sizeof(ObCsvFileWriter))); - } else { - data_writer = new(ptr) ObCsvFileWriter(access_info_, file_location_, use_shared_buf_, - has_compress_, has_lob_, write_offset_); - } - break; - } - case ObExternalFileFormat::FormatType::PARQUET_FORMAT: - { -#ifndef OB_BUILD_EMBED_MODE - if (OB_ISNULL(ptr = ctx_.get_allocator().alloc(sizeof(ObParquetFileWriter)))) { - ret = OB_ALLOCATE_MEMORY_FAILED; - LOG_WARN("failed to allocate data writer", K(ret), K(sizeof(ObParquetFileWriter))); - } else { - data_writer = new(ptr) ObParquetFileWriter(access_info_, file_location_, parquet_writer_schema_); - } -#else - ret = OB_NOT_SUPPORTED; - LOG_WARN("parquet is not supported in embed mode", K(ret)); -#endif // OB_BUILD_EMBED_MODE - break; - } - case ObExternalFileFormat::FormatType::ORC_FORMAT: - { - ret = OB_NOT_SUPPORTED; - break; - } - default: - { - ret = OB_NOT_SUPPORTED; - LOG_WARN("not support select into type", K(format_type_)); - } - } - return ret; -} - -void ObSelectIntoOp::destroy() -{ - ObExternalFileWriter *data_writer = NULL; - if (do_partition_) { - for (ObPartitionWriterMap::iterator iter = partition_map_.begin(); - iter != partition_map_.end(); iter++) { - if (OB_ISNULL(data_writer = iter->second)) { - } else { - data_writer->~ObExternalFileWriter(); - } - } - } else if (OB_NOT_NULL(data_writer_)) { - data_writer_->~ObExternalFileWriter(); - } -#ifndef OB_BUILD_EMBED_MODE - { - ObMallocHookAttrGuard guard(ObMemAttr(MTL_ID(), "IntoParquet")); - parquet_writer_schema_.reset(); - } -#endif - external_properties_.~ObExternalFileFormat(); - partition_map_.destroy(); - ObOperator::destroy(); -} - -#undef ARROW_FAIL -} -} -#undef CAST_FAIL + parquet::Int96 &res) + { + int ret = OB_SUCCESS; + int64_t out_usec = 0; + int32_t tmp_offset = 0; + ObOTimestampData 
oracle_timestamp; + uint32_t julian_date_value = (out_usec / 86400000000LL) + 2440588; + uint64_t nsec_time_value = oracle_timestamp.time_ctx_.tail_nsec_ + std::abs(out_usec % 86400000000LL) * 1000; + res.value[2] = julian_date_value; + res.value[1] = nsec_time_value >> 32; + res.value[0] = nsec_time_value & UINT32_MAX; + return ret; + } + + int ObSelectIntoOp::check_parquet_file_size(ObParquetFileWriter &data_writer) + { + int ret = OB_SUCCESS; + int64_t row_group_size = data_writer.get_row_group_size(); + int64_t file_size = data_writer.get_file_size(); + if (file_need_split(file_size)) { + if (OB_FAIL(split_file(data_writer))) { + LOG_WARN("failed to split file", K(ret)); + } else { + data_writer.set_write_bytes(0); + } + } else if (row_group_size > external_properties_.parquet_format_.row_group_size_) { + data_writer.get_row_group_writer()->Close(); + data_writer.set_write_bytes(file_size); + data_writer.open_next_row_group_writer(); + } + return ret; + } + + int ObSelectIntoOp::build_parquet_cell(parquet::RowGroupWriter* rg_writer, + const ObDatumMeta &datum_meta, + const ObObjMeta &obj_meta, + const common::ObIVector* expr_vector, + int64_t col_idx, + int64_t row_idx, + int64_t row_offset, + int64_t &value_offset, + int16_t* definition_levels, + ObIAllocator &allocator, + void* value_batch, + const bool is_strict_mode, + const ObDateSqlMode date_sql_mode) + { + int ret = OB_SUCCESS; + int16_t null_definition_level = 0; + int16_t normal_definition_level = 1; + std::shared_ptr p_node; + parquet::ColumnWriter *col_writer = nullptr; + if (OB_ISNULL(expr_vector) || !parquet_writer_schema_ || OB_ISNULL(rg_writer) + || OB_ISNULL(col_writer = rg_writer->column(col_idx)) + || OB_ISNULL(definition_levels) || OB_ISNULL(value_batch) + || !(p_node = std::static_pointer_cast(parquet_writer_schema_->field(col_idx)))) { + ret = OB_ERR_UNEXPECTED; + LOG_WARN("get null ptr", K(ret)); + } else if (p_node->is_group()) { + ret = OB_NOT_SUPPORTED; + 
LOG_USER_ERROR(OB_NOT_SUPPORTED, "group type in parquet"); + LOG_WARN("not support group type in parquet", K(ret)); + } else { + switch (p_node->physical_type()) { + case parquet::Type::BYTE_ARRAY: + { + parquet::ByteArray* value = reinterpret_cast(value_batch); + value += value_offset; + char *buf = nullptr; + uint32_t res_len = 0; + if (expr_vector->is_null(row_idx)) { + definition_levels[row_offset] = null_definition_level; + } else if (OB_FAIL(calc_byte_array(expr_vector, + row_idx, + datum_meta, + obj_meta, + allocator, + buf, + res_len))) { + LOG_WARN("failed to calc parquet byte array", K(ret)); + } else { + value->ptr = reinterpret_cast(buf); + value->len = res_len; + value_offset++; + definition_levels[row_offset] = normal_definition_level; + } + break; + } + case parquet::Type::FIXED_LEN_BYTE_ARRAY: + { + parquet::FixedLenByteArray* value = reinterpret_cast(value_batch); + value += value_offset; + parquet::FixedLenByteArrayWriter *writer = static_cast(col_writer); + int parquet_decimal_length = writer->descr()->type_length(); + ObArrayWrap parquet_flba; + if (expr_vector->is_null(row_idx)) { + definition_levels[row_offset] = null_definition_level; + } else if (OB_FAIL(parquet_flba.allocate_array(allocator, parquet_decimal_length))) { + LOG_WARN("failed to allocate array", K(ret)); + } else if (OB_FAIL(calc_parquet_decimal_array(expr_vector, + row_idx, + datum_meta, + parquet_decimal_length, + parquet_flba.get_data()))) { + LOG_WARN("failed to calc parquet decimal", K(ret)); + } else { + value->ptr = parquet_flba.get_data(); + value_offset++; + definition_levels[row_offset] = normal_definition_level; + } + break; + } + case parquet::Type::DOUBLE: + { + double* value = reinterpret_cast(value_batch); + value += value_offset; + if (expr_vector->is_null(row_idx)) { + definition_levels[row_offset] = null_definition_level; + } else { + *value = expr_vector->get_double(row_idx); + value_offset++; + definition_levels[row_offset] = normal_definition_level; + } + 
break; + } + case parquet::Type::FLOAT: + { + float* value = reinterpret_cast(value_batch); + value += value_offset; + if (expr_vector->is_null(row_idx)) { + definition_levels[row_offset] = null_definition_level; + } else { + *value = expr_vector->get_float(row_idx); + value_offset++; + definition_levels[row_offset] = normal_definition_level; + } + break; + } + case parquet::Type::INT32: + { + int32_t* value = reinterpret_cast(value_batch); + value += value_offset; + if (expr_vector->is_null(row_idx)) { + definition_levels[row_offset] = null_definition_level; + } else if (ob_is_mysql_date_tc(datum_meta.type_)) { + ObMySQLDate mdate(expr_vector->get_int32(row_idx)); + if (CAST_FAIL(ObTimeConverter::mdate_to_date(mdate, *value, date_sql_mode))) { + LOG_WARN("mdate_to_date fail", K(ret)); + } else { + value_offset++; + definition_levels[row_offset] = normal_definition_level; + } + } else { + *value = expr_vector->get_int32(row_idx); + value_offset++; + definition_levels[row_offset] = normal_definition_level; + } + break; + } + case parquet::Type::INT64: + { + int64_t* value = reinterpret_cast(value_batch); + value += value_offset; + if (expr_vector->is_null(row_idx)) { + definition_levels[row_offset] = null_definition_level; + } else if (ob_is_mysql_datetime(datum_meta.type_)) { + ObMySQLDateTime mdatetime(expr_vector->get_int(row_idx)); + if (CAST_FAIL(ObTimeConverter::mdatetime_to_datetime(mdatetime, *value, date_sql_mode))) { + LOG_WARN("mdatetime_to_datetime fail", K(ret)); + } else { + value_offset++; + definition_levels[row_offset] = normal_definition_level; + } + } else { + *value = expr_vector->get_int(row_idx); + value_offset++; + definition_levels[row_offset] = normal_definition_level; + } + break; + } + case parquet::Type::INT96: + { + parquet::Int96* value = reinterpret_cast(value_batch); + value += value_offset; + if (expr_vector->is_null(row_idx)) { + definition_levels[row_offset] = null_definition_level; + } else if 
(OB_FAIL(oracle_timestamp_to_int96(expr_vector, row_idx, datum_meta, *value))) { + LOG_WARN("failed to convert timestamp to int96", K(ret)); + } else { + value_offset++; + definition_levels[row_offset] = normal_definition_level; + } + break; + } + default: + { + ret = OB_ERR_UNEXPECTED; + LOG_WARN("unexpected type", K(p_node->physical_type()), K(ret)); + } + } + } + return ret; + } + + int ObSelectIntoOp::calc_parquet_decimal_array(const common::ObIVector* expr_vector, + int row_idx, + const ObDatumMeta &datum_meta, + int parquet_decimal_length, + uint8_t* parquet_flba_ptr) + { + int ret = OB_SUCCESS; + const ObDecimalInt* ob_decimal; + const uint8_t* decimal_bytes; + ObDecimalIntBuilder tmp_dec_alloc; + ObDecimalInt* tmp_decimal; + int ob_decimal_length = wide::ObDecimalIntConstValue::get_int_bytes_by_precision(datum_meta.precision_); + if (ob_is_decimal_int_tc(datum_meta.get_type())) { + ob_decimal = expr_vector->get_decimal_int(row_idx); + } else if (ob_is_number_tc(datum_meta.get_type())) { + number::ObNumber number(expr_vector->get_number(row_idx)); + if (OB_FAIL(wide::from_number_to_decimal_fixed_length(number, tmp_dec_alloc, datum_meta.scale_, + ob_decimal_length, tmp_decimal))){ + LOG_WARN("failed to case number to decimal int", K(ret)); + } else { + ob_decimal = tmp_decimal; + } + } else { + ret = OB_ERR_UNEXPECTED; + LOG_WARN("get unexpected type", K(datum_meta.get_type())); + } + if (OB_FAIL(ret)) { + } else if (ob_decimal_length < parquet_decimal_length) { + ret = OB_ERR_UNEXPECTED; + LOG_WARN("get unexpected decimal length", K(ob_decimal_length), K(parquet_decimal_length), K(ret)); + } else { + switch (ob_decimal_length) { + case sizeof(int32_t): + { + decimal_bytes = reinterpret_cast(ob_decimal->int32_v_); + break; + } + case sizeof(int64_t): + { + decimal_bytes = reinterpret_cast(ob_decimal->int64_v_); + break; + } + case sizeof(int128_t): + { + decimal_bytes = reinterpret_cast(ob_decimal->int128_v_); + break; + } + case sizeof(int256_t): + { + 
decimal_bytes = reinterpret_cast(ob_decimal->int256_v_); + break; + } + case sizeof(int512_t): + { + decimal_bytes = reinterpret_cast(ob_decimal->int512_v_); + break; + } + default: + { + ret = OB_ERR_UNEXPECTED; + LOG_WARN("unexpected type", K(ob_decimal_length), K(ret)); + } + } + } + for (int i = 0; OB_SUCC(ret) && i < parquet_decimal_length; i++) { + parquet_flba_ptr[i] = decimal_bytes[parquet_decimal_length - i - 1]; + } + return ret; + } + #endif // !OB_BUILD_EMBED_MODE + + int ObSelectIntoOp::into_dumpfile(ObExternalFileWriter *data_writer) + { + int ret = OB_SUCCESS; + char buf[MAX_VALUE_LENGTH]; + int64_t buf_len = MAX_VALUE_LENGTH; + int64_t pos = 0; + if (OB_ISNULL(data_writer)) { + ret = OB_ERR_UNEXPECTED; + LOG_WARN("get unexpected null", K(ret)); + } else if (OB_FAIL(get_row_str(buf_len, is_first_, buf, pos))) { + LOG_WARN("get str failed", K(ret)); + } else if (is_first_) { // create file + if (OB_FAIL(data_writer->file_appender_.create(file_name_.get_varchar(), true))) { + LOG_WARN("create dumpfile failed", K(ret), K(file_name_)); + } else { + is_first_ = false; + } + } + if (OB_SUCC(ret)) { + if (OB_FAIL(data_writer->file_appender_.append(buf, pos, false))) { + LOG_WARN("failed to append file"); + } else { + //do nothing + } + } + return ret; + } + + int ObSelectIntoOp::into_varlist() + { + int ret = OB_SUCCESS; + //before 4_1 use output + //after 4_1 use select exprs + const ObIArray &select_exprs = (MY_SPEC.select_exprs_.empty()) ? 
+ MY_SPEC.output_ : MY_SPEC.select_exprs_; + const ObIArray &user_vars = MY_SPEC.user_vars_; + ObArenaAllocator lob_tmp_allocator("LobTmp", OB_MALLOC_NORMAL_BLOCK_SIZE, MTL_ID()); + if (select_exprs.count() != user_vars.count()) { + ret = OB_ERR_COLUMN_SIZE; + LOG_WARN("user vars count should be equal to select exprs count" , K(ret), + K(select_exprs.count()), K(user_vars.count())); + } else { + for (int i = 0 ; i < user_vars.count(); ++i) { + const ObString &var_name = user_vars.at(i); + ObObj obj; + ObDatum *datum = NULL; + if (OB_FAIL(select_exprs.at(i)->eval(eval_ctx_, datum))) { + LOG_WARN("eval expr failed", K(ret)); + } else if (OB_FAIL(datum->to_obj(obj, select_exprs.at(i)->obj_meta_))) { + LOG_WARN("convert datum to obj failed", K(ret), KPC(select_exprs.at(i))); + } else if (obj.is_lob_storage() + // outrow lob can not be assigned to user var, so convert outrow to inrow lob + // user var has independent memory, so using temporary memory here is fine + && OB_FAIL(ObTextStringIter::convert_outrow_lob_to_inrow_templob(obj, obj, nullptr, &lob_tmp_allocator, true/*allow_persist_inrow*/))) { + LOG_WARN("convert outrow to inrow lob failed", K(ret), K(obj)); + } else if (OB_FAIL(ObVariableSetExecutor::set_user_variable(obj, var_name, + ctx_.get_my_session()))) { + LOG_WARN("set user variable failed", K(ret)); + } + } + } + return ret; + } + + int ObSelectIntoOp::extract_fisrt_wchar_from_varhcar(const ObObj &obj, int32_t &wchar) + { + int ret = OB_SUCCESS; + int32_t length = 0; + if (obj.is_varying_len_char_type()) { + ObString str = obj.get_varchar(); + if (str.length() > 0) { + ret = ObCharset::mb_wc(obj.get_collation_type(), str.ptr(), str.length(), length, wchar); + } + } + return ret; + } + + int ObSelectIntoOp::print_wchar_to_buf(char *buf, + const int64_t buf_len, + int64_t &pos, + int32_t wchar, + ObString &str, + ObCollationType coll_type) + { + int ret = OB_SUCCESS; + int result_len = 0; + if (OB_FAIL(ObCharset::wc_mb(coll_type, wchar, buf + pos, buf_len 
- pos, result_len))) { + LOG_WARN("failed to convert wc to mb"); + } else { + str = ObString(result_len, buf + pos); + pos += result_len; + } + return ret; + } + + int ObSelectIntoOp::prepare_escape_printer() + { + int ret = OB_SUCCESS; + int64_t pos = 0; + char *buf = NULL; + int64_t buf_len = 6 * ObCharset::MAX_MB_LEN; + // mb->wc + int32_t wchar_enclose = char_enclose_; + int32_t wchar_escape = char_escape_; + int32_t wchar_field = 0; + int32_t wchar_line = 0; + int32_t wchar_zero = '\0'; + int32_t wchar_replace = 0; + OZ(extract_fisrt_wchar_from_varhcar(field_str_, wchar_field)); + OZ(extract_fisrt_wchar_from_varhcar(line_str_, wchar_line)); + OZ(ObCharset::get_replace_character(cs_type_, wchar_replace)); + // wc->mb + if (OB_ISNULL(buf = static_cast(ctx_.get_allocator().alloc(buf_len)))) { + ret = OB_ALLOCATE_MEMORY_FAILED; + LOG_WARN("failed to allocate buffer", K(ret), K(buf_len)); + } + if (has_enclose_) { + OZ(print_wchar_to_buf(buf, buf_len, pos, wchar_enclose, escape_printer_.enclose_, cs_type_)); + } + if (has_escape_) { + OZ(print_wchar_to_buf(buf, buf_len, pos, wchar_escape, escape_printer_.escape_, cs_type_)); + } + OZ(print_wchar_to_buf(buf, buf_len, pos, wchar_zero, escape_printer_.zero_, cs_type_)); + OZ(print_wchar_to_buf(buf, buf_len, pos, wchar_field, escape_printer_.field_terminator_, cs_type_)); + OZ(print_wchar_to_buf(buf, buf_len, pos, wchar_line, escape_printer_.line_terminator_, cs_type_)); + OZ(print_wchar_to_buf(buf, buf_len, pos, wchar_replace, escape_printer_.convert_replacer_, cs_type_)); + escape_printer_.coll_type_ = cs_type_; + escape_printer_.ignore_convert_failed_ = true; // todo@linyi provide user-defined interface + return ret; + } + + int ObSelectIntoOp::check_has_lob_or_json() + { + int ret = OB_SUCCESS; + const ObIArray &select_exprs = MY_SPEC.select_exprs_; + for (int64_t i = 0; OB_SUCC(ret) && (!has_lob_ || !has_json_ || !has_coll_) && i < select_exprs.count(); ++i) { + if (OB_ISNULL(select_exprs.at(i))) { + ret = 
OB_ERR_UNEXPECTED; + LOG_WARN("select expr is unexpected null", K(ret)); + } else if (ob_is_text_tc(select_exprs.at(i)->obj_meta_.get_type())) { + has_lob_ = true; + } else if (ob_is_json_tc(select_exprs.at(i)->obj_meta_.get_type())) { + has_json_ = true; + } else if (ob_is_collection_sql_type(select_exprs.at(i)->obj_meta_.get_type())) { + has_coll_ = true; + } + } + return ret; + } + + int ObSelectIntoOp::create_shared_buffer_for_data_writer() + { + int ret = OB_SUCCESS; + shared_buf_len_ = has_lob_ ? (5 * SHARED_BUFFER_SIZE) : SHARED_BUFFER_SIZE; + if (OB_ISNULL(shared_buf_ = static_cast(ctx_.get_allocator().alloc(shared_buf_len_)))) { + ret = OB_ALLOCATE_MEMORY_FAILED; + LOG_WARN("failed to allocate buffer", K(ret), K(shared_buf_len_)); + } + if (OB_SUCC(ret) && (has_json_ || has_coll_) && has_escape_) { + json_buf_len_ = OB_MALLOC_MIDDLE_BLOCK_SIZE; + if (OB_ISNULL(json_buf_ = static_cast(ctx_.get_allocator().alloc(json_buf_len_)))) { + ret = OB_ALLOCATE_MEMORY_FAILED; + LOG_WARN("failed to allocate buffer", K(ret), K(json_buf_len_)); + } + } + return ret; + } + + int ObSelectIntoOp::check_secure_file_path(ObString file_name) + { + int ret = OB_SUCCESS; + ObString file_path = file_name.split_on(file_name.reverse_find('/')); + char full_path_buf[PATH_MAX+1]; + char *actual_path = nullptr; + ObSqlString sql_str; + ObString secure_file_priv; + int64_t tenant_id = MTL_ID(); + if (OB_FAIL(sql_str.append(file_path.empty() ? "." 
: file_path))) { + LOG_WARN("failed to append string", K(ret)); + #ifdef _WIN32 + } else if (OB_ISNULL(actual_path = _fullpath(full_path_buf, sql_str.ptr(), PATH_MAX))) { + #else + } else if (OB_ISNULL(actual_path = realpath(sql_str.ptr(), full_path_buf))) { + #endif + ret = OB_FILE_NOT_EXIST; + LOG_WARN("file not exist", K(ret), K(sql_str)); + } else if (OB_FAIL(ObSchemaUtils::get_tenant_varchar_variable(tenant_id, + SYS_VAR_SECURE_FILE_PRIV, + ctx_.get_allocator(), + secure_file_priv))) { + LOG_WARN("fail get tenant variable", K(tenant_id), K(secure_file_priv), K(ret)); + } else if (OB_FAIL(ObResolverUtils::check_secure_path(secure_file_priv, actual_path))) { + LOG_WARN("failed to check secure path", K(ret), K(secure_file_priv)); + if (OB_ERR_NO_PRIVILEGE == ret) { + ret = OB_ERR_NO_PRIV_DIRECT_PATH_ACCESS; + LOG_ERROR("failed to check secure path", K(ret), K(secure_file_priv)); + } + } + return ret; + } + + int ObSelectIntoOp::get_data_writer_for_partition(const ObString &partition_str, + ObExternalFileWriter *&data_writer) + { + int ret = OB_SUCCESS; + ObString partition; + ObExternalFileWriter *value = NULL; + ObCsvFileWriter *csv_data_writer = NULL; + if (OB_SUCC(partition_map_.get_refactored(partition_str, value))) { + if (OB_ISNULL(value)) { + ret = OB_ERR_UNEXPECTED; + LOG_WARN("get unexpected null", K(ret)); + } else { + data_writer = value; + } + } else if (OB_UNLIKELY(OB_HASH_NOT_EXIST != ret)) { + LOG_WARN("get unexpected error", K(ret)); + } else if (curr_partition_num_ >= OB_MAX_PARTITION_NUM_ORACLE) { + ret = OB_TOO_MANY_PARTITIONS_ERROR; + LOG_WARN("too many partitions", K(ret)); + } else { + ret = OB_SUCCESS; + bool writer_added = false; + if (OB_FAIL(new_data_writer(data_writer))) { + LOG_WARN("failed to new data writer", K(ret)); + } else if (OB_ISNULL(data_writer)) { + ret = OB_ERR_UNEXPECTED; + LOG_WARN("get unexpected null", K(ret)); + } else if (ObExternalFileFormat::FormatType::CSV_FORMAT == format_type_ && MY_SPEC.buffer_size_ > 0) { + 
csv_data_writer = static_cast(data_writer); + if (OB_FAIL(csv_data_writer->alloc_buf(ctx_.get_allocator(), MY_SPEC.buffer_size_))) { + LOG_WARN("failed to alloc buffer", K(ret)); + } + } + //add to hashmap + if (OB_FAIL(ret)) { + } else if (OB_FAIL(ob_write_string(ctx_.get_allocator(), + partition_str, + partition))) { + LOG_WARN("failed to write string", K(ret)); + } else if (OB_FAIL(partition_map_.set_refactored(partition, data_writer))) { + LOG_WARN("failed to add data writer to map", K(ret)); + } else { + curr_partition_num_++; + writer_added = true; + } + if (OB_FAIL(ret) && NULL != data_writer && !writer_added) { + data_writer->~ObExternalFileWriter(); + } + //calc file path + if (OB_SUCC(ret) && OB_FAIL(calc_file_path_with_partition(partition, *data_writer))) { + LOG_WARN("failed to calc file path with partition", K(ret)); + } + } + return ret; + } + + int ObSelectIntoOp::create_the_only_data_writer(ObExternalFileWriter *&data_writer) + { + int ret = OB_SUCCESS; + ObCsvFileWriter *csv_data_writer = NULL; + if (OB_FAIL(new_data_writer(data_writer))) { + LOG_WARN("failed to new data writer", K(ret)); + } else if (OB_ISNULL(data_writer)) { + ret = OB_ERR_UNEXPECTED; + LOG_WARN("get unexpected null", K(ret)); + } else { + data_writer->url_ = basic_url_; + data_writer_ = data_writer; + } + if (OB_FAIL(ret)) { + } else if (T_INTO_OUTFILE == MY_SPEC.into_type_ && MY_SPEC.is_single_ + && OB_FAIL(data_writer->open_file())) { + LOG_WARN("failed to open file", K(ret)); + } else if (ObExternalFileFormat::FormatType::CSV_FORMAT == format_type_ && MY_SPEC.buffer_size_ > 0) { + csv_data_writer = static_cast(data_writer); + if (OB_FAIL(csv_data_writer->alloc_buf(ctx_.get_allocator(), MY_SPEC.buffer_size_))) { + LOG_WARN("failed to alloc buffer", K(ret)); + } + } + return ret; + } + + int ObSelectIntoOp::new_data_writer(ObExternalFileWriter *&data_writer) + { + int ret = OB_SUCCESS; + void *ptr = NULL; + switch (format_type_) + { + case 
ObExternalFileFormat::FormatType::CSV_FORMAT: + { + if (OB_ISNULL(ptr = ctx_.get_allocator().alloc(sizeof(ObCsvFileWriter)))) { + ret = OB_ALLOCATE_MEMORY_FAILED; + LOG_WARN("failed to allocate data writer", K(ret), K(sizeof(ObCsvFileWriter))); + } else { + data_writer = new(ptr) ObCsvFileWriter(access_info_, file_location_, use_shared_buf_, + has_compress_, has_lob_, write_offset_); + } + break; + } + case ObExternalFileFormat::FormatType::PARQUET_FORMAT: + { + #ifdef OB_BUILD_EMBED_MODE + ret = OB_NOT_SUPPORTED; + LOG_WARN("parquet not supported in embed mode", K(ret)); + #else + if (lib::is_embed_mode()) { + ret = OB_NOT_SUPPORTED; + LOG_WARN("parquet not supported in embed mode", K(ret)); + } else if (OB_ISNULL(ptr = ctx_.get_allocator().alloc(sizeof(ObParquetFileWriter)))) { + ret = OB_ALLOCATE_MEMORY_FAILED; + LOG_WARN("failed to allocate data writer", K(ret), K(sizeof(ObParquetFileWriter))); + } else { + data_writer = new(ptr) ObParquetFileWriter(access_info_, file_location_, parquet_writer_schema_); + } + #endif + break; + } + case ObExternalFileFormat::FormatType::ORC_FORMAT: + { + ret = OB_NOT_SUPPORTED; + break; + } + default: + { + ret = OB_NOT_SUPPORTED; + LOG_WARN("not support select into type", K(format_type_)); + } + } + return ret; + } + + void ObSelectIntoOp::destroy() + { + ObExternalFileWriter *data_writer = NULL; + if (do_partition_) { + for (ObPartitionWriterMap::iterator iter = partition_map_.begin(); + iter != partition_map_.end(); iter++) { + if (OB_ISNULL(data_writer = iter->second)) { + } else { + data_writer->~ObExternalFileWriter(); + } + } + } else if (OB_NOT_NULL(data_writer_)) { + data_writer_->~ObExternalFileWriter(); + } + #ifndef OB_BUILD_EMBED_MODE + { + ObMallocHookAttrGuard guard(ObMemAttr(MTL_ID(), "IntoParquet")); + parquet_writer_schema_.reset(); + } + #endif + external_properties_.~ObExternalFileFormat(); + partition_map_.destroy(); + ObOperator::destroy(); + } + + #undef ARROW_FAIL + } + } + #undef CAST_FAIL + \ No newline 
at end of file diff --git a/src/sql/engine/table/ob_external_table_access_service.cpp b/src/sql/engine/table/ob_external_table_access_service.cpp index 67c0749d3..bb61fd774 100644 --- a/src/sql/engine/table/ob_external_table_access_service.cpp +++ b/src/sql/engine/table/ob_external_table_access_service.cpp @@ -14,857 +14,858 @@ * limitations under the License. */ -#define USING_LOG_PREFIX SQL - -#include "ob_external_table_access_service.h" -#include "share/backup/ob_backup_io_adapter.h" -#include "share/external_table/ob_external_table_utils.h" -#include "share/ob_device_manager.h" -#ifndef OB_BUILD_EMBED_MODE -#include "sql/engine/table/ob_parquet_table_row_iter.h" -#include "sql/engine/table/ob_orc_table_row_iter.h" -#endif -#include "sql/engine/cmd/ob_load_data_file_reader.h" -#include "sql/engine/table/ob_csv_table_row_iter.h" -#include "sql/engine/expr/ob_expr_regexp_context.h" -#include "share/config/ob_server_config.h" - -namespace oceanbase -{ -namespace common { -extern const char *OB_STORAGE_ACCESS_TYPES_STR[]; -} - -namespace share -{ -struct ObExternalTablePartInfo; -class ObExternalTablePartInfoArray; -} -using namespace share::schema; -using namespace common; -using namespace share; -namespace sql -{ - -static constexpr uint64_t OB_STORAGE_ID_EXTERNAL = 2001; - -ObExternalDataAccessDriver::~ObExternalDataAccessDriver() { - close(); - if (OB_NOT_NULL(device_handle_)) { - ObDeviceManager::get_instance().release_device(device_handle_); - } -} - -void ObExternalDataAccessDriver::close() -{ - if (OB_NOT_NULL(device_handle_) && fd_.is_valid()) { - int ret = OB_SUCCESS; - if (OB_FAIL(ObBackupIoAdapter::close_device_and_fd(device_handle_, fd_))) { - LOG_WARN("fail to close device and fd", KR(ret), K_(fd), KP_(device_handle)); - } - } -} - -bool ObExternalDataAccessDriver::is_opened() const -{ - return fd_.is_valid(); -} - -int ObExternalDataAccessDriver::get_file_size(const ObString &url, int64_t &file_size) -{ - int ret = OB_SUCCESS; - file_size = -1; - 
CONSUMER_GROUP_FUNC_GUARD(ObFunctionType::PRIO_IMPORT); - ObString url_cstring; - ObArenaAllocator allocator; - - if (OB_FAIL(ob_write_string(allocator, url, url_cstring, true/*c_style*/))) { - LOG_WARN("fail to copy string", KR(ret), K(url)); - } else if (OB_FAIL(ObBackupIoAdapter::get_file_length(url_cstring, access_info_, file_size))) { - LOG_WARN("fail to get file length", KR(ret), K(url_cstring), K_(access_info)); - } - - if (OB_OBJECT_NOT_EXIST == ret || OB_IO_ERROR == ret) { - file_size = -1; - ret = OB_SUCCESS; - } - return ret; -} - -int ObExternalDataAccessDriver::open(const char *url) -{ - int ret = OB_SUCCESS; - if (OB_UNLIKELY(is_opened())) { - ret = OB_INIT_TWICE; - LOG_WARN("Data Access Driver has been opened", KR(ret), K(url)); - } else if (OB_FAIL(ObBackupIoAdapter::open_with_access_type( - device_handle_, fd_, access_info_, url, OB_STORAGE_ACCESS_READER, - ObStorageIdMod(OB_STORAGE_ID_EXTERNAL, ObStorageUsedMod::STORAGE_USED_EXTERNAL)))) { - LOG_WARN("fail to open Data Access Driver", KR(ret), K_(access_info), K(url)); - } - return ret; -} - -int ObExternalDataAccessDriver::pread(void *buf, const int64_t count, const int64_t offset, int64_t &read_size) -{ - int ret = OB_SUCCESS; - ObIOHandle io_handle; - CONSUMER_GROUP_FUNC_GUARD(PRIO_IMPORT); - if (OB_FAIL(ObBackupIoAdapter::async_pread(*device_handle_, fd_, - static_cast(buf), offset, count, io_handle))) { - LOG_WARN("fail to async pread", KR(ret), - KP_(device_handle), K_(fd), KP(buf), K(offset), K(count)); - } else if (OB_FAIL(io_handle.wait())) { - LOG_WARN("fail to wait pread result", KR(ret), - KP_(device_handle), K_(fd), KP(buf), K(offset), K(count)); - } else { - read_size = io_handle.get_data_size(); - } - return ret; -} - -class ObExternalFileListArrayOpWithFilter : public ObBaseDirEntryOperator -{ -public: - ObExternalFileListArrayOpWithFilter(ObIArray & name_array, - ObIArray & file_size, - ObExternalPathFilter *filter, - ObIAllocator& array_allocator) - : name_array_(name_array), 
file_size_(file_size), filter_(filter), allocator_(array_allocator) {} - - virtual bool need_get_file_size() const override { return true; } - int func(const dirent *entry) { - int ret = OB_SUCCESS; - if (OB_ISNULL(entry)) { - ret = OB_INVALID_ARGUMENT; - OB_LOG(WARN, "invalid list entry, entry is null"); - } else if (OB_ISNULL(entry->d_name)) { - ret = OB_INVALID_ARGUMENT; - OB_LOG(WARN, "invalid list entry, d_name is null"); - } else { - const ObString file_name(entry->d_name); - ObString tmp_file; - bool is_filtered = false; - if (!file_name.empty() && file_name[file_name.length() - 1] != '/') { - if (OB_NOT_NULL(filter_) && OB_FAIL(filter_->is_filtered(file_name, is_filtered))) { - LOG_WARN("fail check is filtered", K(ret)); - } else if (!is_filtered) { - if (OB_FAIL(ob_write_string(allocator_, file_name, tmp_file, true))) { - OB_LOG(WARN, "fail to save file name", K(ret), K(file_name)); - } else if (OB_FAIL(name_array_.push_back(tmp_file))) { - OB_LOG(WARN, "fail to push filename to array", K(ret), K(tmp_file)); - } else if (OB_FAIL(file_size_.push_back(get_size()))) { - OB_LOG(WARN, "fail to push size to array", K(ret), K(tmp_file)); - } - } - } - } - return ret; - } - -private: - ObIArray & name_array_; - ObIArray & file_size_; - ObExternalPathFilter *filter_; - ObIAllocator& allocator_; -}; - -class ObLocalFileListArrayOpWithFilter : public ObBaseDirEntryOperator -{ -public: - ObLocalFileListArrayOpWithFilter(ObIArray &name_array, - ObIArray & file_size, - const ObString &path, - const ObString &origin_path, - ObExternalPathFilter *filter, - ObIAllocator &array_allocator) - : name_array_(name_array), file_size_(file_size), path_(path), origin_path_(origin_path), - filter_(filter), allocator_(array_allocator) {} - virtual bool need_get_file_size() const override { return true; } - int func(const dirent *entry) - { - int ret = OB_SUCCESS; - if (OB_ISNULL(entry)) { - ret = OB_INVALID_ARGUMENT; - OB_LOG(WARN, "invalid list entry, entry is null"); - } else if 
(OB_ISNULL(entry->d_name)) { - ret = OB_INVALID_ARGUMENT; - OB_LOG(WARN, "invalid list entry, d_name is null"); - } else { - const ObString file_name(entry->d_name); - ObSqlString full_path; - ObString tmp_file; - bool is_filtered = false; - ObString cur_path = path_; - if (file_name.case_compare(".") == 0 - || file_name.case_compare("..") == 0) { - //do nothing - } else if (OB_FAIL(full_path.assign(cur_path))) { - OB_LOG(WARN, "assign string failed", K(ret)); - } else if (full_path.length() > 0 && *(full_path.ptr() + full_path.length() - 1) != '/' && - OB_FAIL(full_path.append("/"))) { - OB_LOG(WARN, "append failed", K(ret)) ; - } else if (OB_FAIL(full_path.append(file_name))) { - OB_LOG(WARN, "append file name failed", K(ret)); - } else if (OB_NOT_NULL(filter_) && OB_FAIL(filter_->is_filtered(full_path.string(), is_filtered))) { - LOG_WARN("fail check is filtered", K(ret)); - } else if (!is_filtered) { - ObString target = full_path.string(); - if (!is_dir_scan()) { - target += origin_path_.length(); - if (!target.empty() && '/' == target[0]) { - target += 1; - } - } - if (target.empty()) { - ret = OB_ERR_UNEXPECTED; - LOG_WARN("empty dir or name", K(full_path), K(origin_path_)); - } else if (OB_FAIL(ob_write_string(allocator_, target, tmp_file, true/*c_style*/))) { - OB_LOG(WARN, "fail to save file name", K(ret), K(file_name)); - } else if (OB_FAIL(name_array_.push_back(tmp_file))) { - OB_LOG(WARN, "fail to push filename to array", K(ret), K(tmp_file)); - } else if (OB_FAIL(file_size_.push_back(get_size()))) { - OB_LOG(WARN, "fail to push size to array", K(ret), K(tmp_file)); - } - } - } - return ret; - } -private: - ObIArray &name_array_; - ObIArray &file_size_; - const ObString &path_; - const ObString &origin_path_; - ObExternalPathFilter *filter_; - ObIAllocator &allocator_; -}; - - -int ObExternalDataAccessDriver::get_file_list(const ObString &path, - const ObString &pattern, - const ObExprRegexpSessionVariables ®exp_vars, - ObIArray &file_urls, - ObIArray 
&file_sizes, - ObIAllocator &allocator) -{ - int ret = OB_SUCCESS; - const int64_t MAX_VISIT_COUNT = 100000; - ObExprRegexContext regexp_ctx; - ObExternalPathFilter filter(regexp_ctx, allocator); - ObString path_cstring; - CONSUMER_GROUP_FUNC_GUARD(PRIO_IMPORT); - - if (OB_UNLIKELY(!access_info_->is_valid())) { - ret = OB_NOT_INIT; - LOG_WARN("ObExternalDataAccessDriver not init", KR(ret), K_(access_info)); - } else if (!pattern.empty() && OB_FAIL(filter.init(pattern, regexp_vars))) { - LOG_WARN("fail to init filter", K(ret)); - } else if (OB_FAIL(ob_write_string(allocator, path, path_cstring, true/*c_style*/))) { - LOG_WARN("fail to copy string", KR(ret), K(path)); - } else if (get_storage_type() == OB_STORAGE_FILE) { - ObSEArray file_dirs; - bool is_dir = false; - - if (get_storage_type() == OB_STORAGE_FILE) { - ObString path_without_prifix; - path_without_prifix = path_cstring; - path_without_prifix += strlen(OB_FILE_PREFIX); - - OZ(FileDirectoryUtils::is_directory(path_without_prifix.ptr(), is_dir)); - if (!is_dir) { - LOG_INFO("external location is not a directory", - K(path_without_prifix)); - } else { - OZ(file_dirs.push_back(path_cstring)); - } - } - - ObArray useless_size; - for (int64_t i = 0; OB_SUCC(ret) && i < file_dirs.count(); i++) { - ObString file_dir = file_dirs.at(i); - ObLocalFileListArrayOpWithFilter dir_op(file_dirs, useless_size, file_dir, path_cstring, NULL, allocator); - ObLocalFileListArrayOpWithFilter file_op(file_urls, file_sizes, file_dir, path_cstring, - pattern.empty() ? 
NULL : &filter, allocator); - dir_op.set_dir_flag(); - if (OB_FAIL(ObBackupIoAdapter::list_files(file_dir, access_info_, file_op))) { - LOG_WARN("fail to list files", KR(ret), K(file_dir), K_(access_info)); - } else if (OB_FAIL(ObBackupIoAdapter::list_directories(file_dir, access_info_, dir_op))) { - LOG_WARN("fail to list dirs", KR(ret), K(file_dir), K_(access_info)); - } else if (file_dirs.count() + file_urls.count() > MAX_VISIT_COUNT) { - ret = OB_SIZE_OVERFLOW; - LOG_WARN("too many files and dirs to visit", K(ret)); - } - } - } else { - ObExternalFileListArrayOpWithFilter file_op(file_urls, file_sizes, pattern.empty() ? NULL : &filter, allocator); - if (OB_FAIL(ObBackupIoAdapter::list_files(path_cstring, access_info_, file_op))) { - LOG_WARN("fail to list files", KR(ret), K(path_cstring), K_(access_info)); - } - } - return ret; -} - -int ObExternalDataAccessDriver::init(const ObString &location, const ObString &access_info) -{ - int ret = OB_SUCCESS; - ObStorageType device_type = OB_STORAGE_MAX_TYPE; - ObArenaAllocator temp_allocator; - ObString location_cstr; - ObString access_info_cstr; - ObBackupIoAdapter util; - - if (OB_FAIL(get_storage_type_from_path(location, device_type))) { - LOG_WARN("fail to resove storage type", K(ret)); - } else { - storage_type_ = device_type; - // Note: if device type is file, the storage info is empty. 
- if (device_type == OB_STORAGE_FILE || - (OB_ISNULL(access_info) || OB_LIKELY(0 == access_info.length()))) { - OZ(ob_write_string(temp_allocator, location, location_cstr, true)); - access_info_cstr.assign_ptr(&dummy_empty_char, static_cast(strlen(&dummy_empty_char))); - } else { - OZ (ob_write_string(temp_allocator, location, location_cstr, true)); - OZ (ob_write_string(temp_allocator, access_info, access_info_cstr, true)); - } - } - access_info_ = &backup_storage_info_; - if (OB_ISNULL(access_info_)) { - ret = OB_INVALID_ARGUMENT; - LOG_WARN("failed to get access into", K(ret), K(device_type), K(access_info_cstr)); - } - LOG_TRACE("resolve storage into", K(ret), K(device_type), K(access_info_cstr)); - OZ (access_info_->set(device_type, access_info_cstr.ptr())); - - return ret; -} - -ObExternalStreamFileReader::~ObExternalStreamFileReader() -{ - reset(); -} - -const char * ObExternalStreamFileReader::MEMORY_LABEL = "ExternalReader"; -const int64_t ObExternalStreamFileReader::COMPRESSED_DATA_BUFFER_SIZE = 2 * 1024 * 1024; - -int ObExternalStreamFileReader::init(const common::ObString &location, - const ObString &access_info, - ObCSVGeneralFormat::ObCSVCompression compression_format, - ObIAllocator &allocator) -{ - int ret = OB_SUCCESS; - if (OB_NOT_NULL(allocator_)) { - ret = OB_INIT_TWICE; - } else if (OB_FAIL(data_access_driver_.init(location, access_info))) { - LOG_WARN("failed to init data access driver", K(ret), K(location), K(access_info)); - } else { - allocator_ = &allocator; - compression_format_ = compression_format; - } - return ret; -} - -int ObExternalStreamFileReader::open(const ObString &filename) -{ - int ret = OB_SUCCESS; - if (OB_ISNULL(allocator_)) { - ret = OB_NOT_INIT; - } else if (data_access_driver_.is_opened()) { - ret = OB_INIT_TWICE; - } else if (OB_FAIL(data_access_driver_.open(filename.ptr()))) { - LOG_WARN("failed to open file", K(ret), K(filename)); - } else if (OB_FAIL(data_access_driver_.get_file_size(filename.ptr(), file_size_))) { 
- LOG_WARN("failed to get file size", K(ret), K(filename)); - } else { - is_file_end_ = false; - - ObCSVGeneralFormat::ObCSVCompression this_file_compression_format = compression_format_; - if (this_file_compression_format == ObCSVGeneralFormat::ObCSVCompression::AUTO - && OB_FAIL(compression_algorithm_from_suffix(filename, this_file_compression_format))) { - LOG_WARN("failed to dectect compression format from filename", K(ret), K(filename)); - } - - if (OB_SUCC(ret) && OB_FAIL(create_decompressor(this_file_compression_format))) { - LOG_WARN("failed to create decompressor", K(ret)); - } - } - - LOG_TRACE("open file done", K(filename), K(ret)); - return ret; -} - -void ObExternalStreamFileReader::close() -{ - if (data_access_driver_.is_opened()) { - data_access_driver_.close(); - - is_file_end_ = true; - file_offset_ = 0; - file_size_ = 0; - LOG_DEBUG("close file"); - } -} - -void ObExternalStreamFileReader::reset() -{ - close(); - if (OB_NOT_NULL(compressed_data_) && OB_NOT_NULL(allocator_)) { - allocator_->free(compressed_data_); - compressed_data_ = nullptr; - } - - if (OB_NOT_NULL(decompressor_)) { - ObDecompressor::destroy(decompressor_); - decompressor_ = nullptr; - } - - allocator_ = nullptr; -} - -bool ObExternalStreamFileReader::eof() -{ - return is_file_end_; -} - -int ObExternalStreamFileReader::read(char *buf, int64_t buf_len, int64_t &read_size) -{ - int ret = OB_SUCCESS; - read_size = 0; - - if (OB_ISNULL(buf) || buf_len <= 0) { - ret = OB_INVALID_ARGUMENT; - } else if (OB_ISNULL(decompressor_)) { - ret = read_from_driver(buf, buf_len, read_size); - is_file_end_ = file_offset_ >= file_size_; - LOG_DEBUG("read file", K(is_file_end_), K(file_offset_), K(file_size_), K(read_size)); - } else { - ret = read_decompress(buf, buf_len, read_size); - is_file_end_ = (file_offset_ >= file_size_) && (consumed_data_size_ >= compress_data_size_); - } - return ret; -} - -int ObExternalStreamFileReader::read_from_driver(char *buf, int64_t buf_len, int64_t &read_size) 
-{ - int ret = OB_SUCCESS; - read_size = 0; - - if (OB_ISNULL(buf) || buf_len <= 0) { - ret = OB_INVALID_ARGUMENT; - } else if(OB_FAIL(data_access_driver_.pread(buf, buf_len, file_offset_, read_size))) { - LOG_WARN("failed to read data from data access driver", K(ret), K(file_offset_), K(buf_len)); - } else { - file_offset_ += read_size; - } - return ret; -} - -int ObExternalStreamFileReader::read_decompress(char *buf, int64_t buf_len, int64_t &read_size) -{ - int ret = OB_SUCCESS; - read_size = 0; - - if (!data_access_driver_.is_opened()) { - ret = OB_NOT_INIT; - } else if (OB_ISNULL(buf) || buf_len <= 0) { - ret = OB_INVALID_ARGUMENT; - LOG_WARN("invalid argument", KP(buf), K(buf_len)); - } else if (consumed_data_size_ >= compress_data_size_) { - if (file_offset_ < file_size_) { - ret = read_compressed_data(); - } else { - is_file_end_ = true; - } - } - - if (OB_SUCC(ret) && compress_data_size_ > consumed_data_size_) { - int64_t consumed_size = 0; - ret = decompressor_->decompress(compressed_data_ + consumed_data_size_, - compress_data_size_ - consumed_data_size_, - consumed_size, - buf, - buf_len, - read_size); - if (OB_FAIL(ret)) { - LOG_WARN("failed to decompress", K(ret)); - } else { - consumed_data_size_ += consumed_size; - uncompressed_size_ += read_size; - } - } - return ret; -} - -int ObExternalStreamFileReader::read_compressed_data() -{ - int ret = OB_SUCCESS; - char *read_buffer = compressed_data_; - if (!data_access_driver_.is_opened()) { - ret = OB_NOT_INIT; - } else if (OB_UNLIKELY(consumed_data_size_ < compress_data_size_)) { - // backup data - const int64_t last_data_size = compress_data_size_ - consumed_data_size_; - MEMMOVE(compressed_data_, compressed_data_ + consumed_data_size_, last_data_size); - read_buffer = compressed_data_ + last_data_size; - consumed_data_size_ = 0; - compress_data_size_ = last_data_size; - } else if (consumed_data_size_ == compress_data_size_) { - consumed_data_size_ = 0; - compress_data_size_ = 0; - } - - if 
(OB_SUCC(ret)) { - // read data from source reader - int64_t read_size = 0; - int64_t capacity = COMPRESSED_DATA_BUFFER_SIZE - compress_data_size_; - ret = read_from_driver(read_buffer, capacity, read_size); - if (OB_SUCC(ret)) { - compress_data_size_ += read_size; - } - } - return ret; -} - -int ObExternalStreamFileReader::create_decompressor(ObCSVGeneralFormat::ObCSVCompression compression_format) -{ - int ret = OB_SUCCESS; - if (OB_ISNULL(allocator_)) { - ret = OB_NOT_INIT; - } else if (compression_format == ObCSVGeneralFormat::ObCSVCompression::NONE) { - ObDecompressor::destroy(decompressor_); - decompressor_ = nullptr; - } else if (OB_NOT_NULL(decompressor_) && decompressor_->compression_format() == compression_format) { - // do nothing - } else { - if (OB_NOT_NULL(decompressor_)) { - ObDecompressor::destroy(decompressor_); - decompressor_ = nullptr; - } - - if (OB_FAIL(ObDecompressor::create(compression_format, *allocator_, decompressor_))) { - LOG_WARN("failed to create decompressor", K(ret)); - } else if (OB_ISNULL(compressed_data_) && - OB_ISNULL(compressed_data_ = (char *)allocator_->alloc(COMPRESSED_DATA_BUFFER_SIZE))) { - ret = OB_ALLOCATE_MEMORY_FAILED; - LOG_WARN("failed to allocate memory", K(COMPRESSED_DATA_BUFFER_SIZE)); - } - } - return ret; -} - -int ObExternalTableAccessService::table_scan( - ObVTableScanParam ¶m, - ObNewRowIterator *&result) -{ - ACTIVE_SESSION_FLAG_SETTER_GUARD(in_storage_read); - const share::ObLSID &ls_id = param.ls_id_; - common::ObASHTabletIdSetterGuard ash_tablet_id_guard(param.tablet_id_.id()); - ACTIVE_SESSION_RETRY_DIAG_INFO_SETTER(ls_id_, ls_id.id()); - ACTIVE_SESSION_RETRY_DIAG_INFO_SETTER(tablet_id_, param.tablet_id_.id()); - int ret = OB_SUCCESS; - ObExternalTableRowIterator* row_iter = NULL; - - auto &scan_param = static_cast(param); - - switch (param.external_file_format_.format_type_) { - case ObExternalFileFormat::CSV_FORMAT: - if (OB_ISNULL(row_iter = OB_NEWx(ObCSVTableRowIterator, (scan_param.allocator_)))) { 
- ret = OB_ALLOCATE_MEMORY_FAILED; - LOG_WARN("alloc memory failed", K(ret)); - } - break; - case ObExternalFileFormat::PARQUET_FORMAT: -#ifndef OB_BUILD_EMBED_MODE - if (OB_ISNULL(row_iter = OB_NEWx(ObParquetTableRowIterator, (scan_param.allocator_)))) { - ret = OB_ALLOCATE_MEMORY_FAILED; - LOG_WARN("alloc memory failed", K(ret)); - } -#else - ret = OB_NOT_SUPPORTED; - LOG_WARN("parquet is not supported in embed mode", K(ret)); -#endif // OB_BUILD_EMBED_MODE - break; - case ObExternalFileFormat::ODPS_FORMAT: - if (!GCONF._use_odps_jni_connector) { - ret = OB_NOT_SUPPORTED; - LOG_WARN("odps cpp connector is not enabled", K(ret)); - } else { - ret = OB_NOT_SUPPORTED; - LOG_WARN("odps jni connector is not enabled", K(ret)); - } - break; - case ObExternalFileFormat::ORC_FORMAT: - ret = OB_NOT_SUPPORTED; - break; - default: - ret = OB_ERR_UNEXPECTED; - LOG_WARN("unexpected format", K(ret), "format", param.external_file_format_.format_type_); - } - - if (OB_SUCC(ret)) { - if (OB_FAIL(row_iter->init(&scan_param))) { - row_iter->~ObExternalTableRowIterator(); - LOG_WARN("fail to open iter", K(ret)); - } else { - result = row_iter; - } - } - - LOG_DEBUG("external table access service iter init", K(ret), "type", param.external_file_format_.format_type_); - - return ret; -} - -int ObExternalTableAccessService::table_rescan(ObVTableScanParam ¶m, ObNewRowIterator *result) -{ - ACTIVE_SESSION_FLAG_SETTER_GUARD(in_storage_read); - const share::ObLSID &ls_id = param.ls_id_; - common::ObASHTabletIdSetterGuard ash_tablet_id_guard(param.tablet_id_.id()); - ACTIVE_SESSION_RETRY_DIAG_INFO_SETTER(ls_id_, ls_id.id()); - ACTIVE_SESSION_RETRY_DIAG_INFO_SETTER(tablet_id_, param.tablet_id_.id()); - int ret = OB_SUCCESS; - if (OB_ISNULL(result)) { - ret = OB_ERR_UNEXPECTED; - LOG_WARN("unexpected iter", K(ret)); - } else { - switch (param.external_file_format_.format_type_) { - case ObExternalFileFormat::CSV_FORMAT: - case ObExternalFileFormat::PARQUET_FORMAT: - result->reset(); - break; - 
case ObExternalFileFormat::ORC_FORMAT: - ret = OB_NOT_SUPPORTED; - break; - case ObExternalFileFormat::ODPS_FORMAT: - ret = OB_NOT_SUPPORTED; - LOG_USER_ERROR(OB_NOT_SUPPORTED, "external odps table"); - LOG_WARN("not support to read odps in opensource", K(ret)); - break; - default: - ret = OB_ERR_UNEXPECTED; - LOG_WARN("unexpected format", K(ret), "format", param.external_file_format_.format_type_); - } - } - LOG_DEBUG("external table rescan", K(param.key_ranges_), K(param.range_array_pos_)); - return ret; -} - -int ObExternalTableAccessService::reuse_scan_iter(const bool switch_param, ObNewRowIterator *iter) -{ - UNUSED(switch_param); - iter->reset(); - return OB_SUCCESS; -} - -int ObExternalTableAccessService::revert_scan_iter(ObNewRowIterator *iter) -{ - int ret = OB_SUCCESS; - if (OB_ISNULL(iter)) { - ret = OB_ERR_UNEXPECTED; - LOG_WARN("unexpected iter", K(ret)); - } else { - iter->~ObNewRowIterator(); - } - return ret; -} - -int ObExternalTableRowIterator::init(const ObTableScanParam *scan_param) -{ - scan_param_ = scan_param; - return init_exprs(scan_param); -} - -int ObExternalTableRowIterator::gen_ip_port(ObIAllocator &allocator) -{ - int ret = OB_SUCCESS; - char buf[MAX_IP_PORT_SQL_LENGTH]; - int32_t len = 0; - OZ (GCONF.self_addr_.addr_to_buffer(buf, MAX_IP_PORT_SQL_LENGTH, len)); - OZ (ob_write_string(allocator, ObString(len, buf), ip_port_)); - return ret; -} - -int ObExternalTableRowIterator::init_exprs(const storage::ObTableScanParam *scan_param) -{ - int ret = OB_SUCCESS; - if (OB_ISNULL(scan_param)) { - ret = OB_ERR_UNEXPECTED; - LOG_WARN("scan param is null", K(ret)); - } else { - if (scan_param->column_ids_.count() != scan_param->output_exprs_->count()) { - ret = OB_ERR_UNEXPECTED; - LOG_WARN("column ids not equal to access expr", K(ret)); - } - for (int i = 0; OB_SUCC(ret) && i < scan_param->column_ids_.count(); i++) { - ObExpr *cur_expr = scan_param->output_exprs_->at(i); - switch (scan_param->column_ids_.at(i)) { - case 
OB_HIDDEN_LINE_NUMBER_COLUMN_ID: - line_number_expr_ = cur_expr; - break; - case OB_HIDDEN_FILE_ID_COLUMN_ID: - file_id_expr_ = cur_expr; - break; - default: - OZ (column_exprs_.push_back(cur_expr)); - break; - } - } - if (OB_SUCC(ret) && column_exprs_.count() != scan_param->ext_column_convert_exprs_->count()) { - ret = OB_ERR_UNEXPECTED; - LOG_WARN("column expr not equal to convert convert expr", K(ret), - K(column_exprs_), KPC(scan_param->ext_column_convert_exprs_)); - } - } - return ret; -} - -int ObExternalTableRowIterator::fill_file_partition_expr(ObExpr *expr, ObNewRow &value, const int64_t row_count) -{ - int ret = OB_SUCCESS; - ObEvalCtx &eval_ctx = scan_param_->op_->get_eval_ctx(); - ObDatum *datums = expr->locate_batch_datums(eval_ctx); - int64_t loc_idx = expr->extra_ - 1; - if (OB_UNLIKELY(loc_idx < 0 || loc_idx >= value.get_count())) { - ret = OB_ERR_UNEXPECTED; - LOG_WARN("loc idx is out of range", K(loc_idx), K(value), K(ret)); - } else { - if (value.get_cell(loc_idx).is_null()) { - for (int j = 0; OB_SUCC(ret) && j < row_count; j++) { - datums[j].set_null(); - } - } else { - for (int j = 0; OB_SUCC(ret) && j < row_count; j++) { - CK (OB_NOT_NULL(datums[j].ptr_)); - OZ (datums[j].from_obj(value.get_cell(loc_idx))); - } - } - } - return ret; -} - -int ObExternalTableRowIterator::calc_file_partition_list_value(const int64_t part_id, ObIAllocator &allocator, ObNewRow &value) -{ - int ret = OB_SUCCESS; - share::schema::ObSchemaGetterGuard schema_guard; - const ObTableSchema *table_schema = NULL; - const ObPartition *partition = NULL; - ObExternalFileFormat::FormatType external_table_type; - bool is_odps_external_table = false; - if (OB_ISNULL(GCTX.schema_service_)) { - ret = OB_ERR_UNEXPECTED; - LOG_WARN("unexpected error"); - } else if (OB_FAIL(GCTX.schema_service_->get_tenant_schema_guard( - scan_param_->tenant_id_, - schema_guard))) { - LOG_WARN("get_schema_guard failed", K(ret)); - } else if 
(OB_FAIL(schema_guard.get_table_schema(scan_param_->tenant_id_, scan_param_->index_id_, table_schema))) { - LOG_WARN("get table schema failed", K(ret)); - } else if (OB_ISNULL(table_schema)) { - ret = OB_TABLE_NOT_EXIST; - LOG_WARN("table not exist", K(scan_param_->index_id_), K(scan_param_->tenant_id_)); - } else if (OB_FAIL(ObSQLUtils::is_odps_external_table(table_schema, is_odps_external_table))) { - LOG_WARN("failed to check is odps external table or not", K(ret)); - } else if (table_schema->is_partitioned_table() && (table_schema->is_user_specified_partition_for_external_table() || is_odps_external_table)) { - if (OB_FAIL(table_schema->get_partition_by_part_id(part_id, CHECK_PARTITION_MODE_NORMAL, partition))) { - LOG_WARN("get partition failed", K(ret), K(part_id)); - } else if (OB_ISNULL(partition) || OB_UNLIKELY(partition->get_list_row_values().count() != 1) - || partition->get_list_row_values().at(0).get_count() != table_schema->get_partition_key_column_num()) { - ret = OB_ERR_UNEXPECTED; - LOG_WARN("partition is invalid", K(ret), K(part_id)); - } else { - int64_t pos = 0; - int64_t size = partition->get_list_row_values().at(0).get_deep_copy_size(); - char *buf = (char *)allocator.alloc(size); - if (OB_ISNULL(buf)) { - ret = OB_ALLOCATE_MEMORY_FAILED; - LOG_WARN("allocate mem failed", K(ret)); - } - OZ (value.deep_copy(partition->get_list_row_values().at(0), buf, size, pos)); - } - } - return ret; -} -int ObExternalTableRowIterator::calc_file_part_list_value_by_array( - const int64_t part_id, ObIAllocator &allocator, - const share::ObExternalTablePartInfoArray *partition_array, ObNewRow &value) -{ - int ret = OB_SUCCESS; - int64_t partition_index = OB_INVALID_INDEX; - share::ObExternalTablePartInfo partition; - - int64_t partition_num = partition_array->count(); - if (OB_ISNULL(partition_array) || partition_num <= 0) { - ret = OB_ERR_UNEXPECTED; - LOG_WARN("invalid partition array", K(ret), K(part_id), K(partition_num)); - } - - for (int64_t i = 0; 
OB_SUCC(ret) && i < partition_num; i++) { - if (part_id == partition_array->at(i).part_id_) { - partition_index = i; - break; - } - } - - if (OB_SUCC(ret) && partition_index != OB_INVALID_INDEX) { - partition = partition_array->at(partition_index); - } - - if (OB_SUCC(ret)) { - if (partition_index == OB_INVALID_INDEX || partition.part_id_ != part_id) { - ret = OB_ERR_UNEXPECTED; - LOG_WARN("invalid partition", K(ret), K(partition), K(part_id)); - } else { - int64_t pos = 0; - int64_t size = partition.list_row_value_.get_deep_copy_size(); - char *buf = (char *)allocator.alloc(size); - if (OB_ISNULL(buf)) { - ret = OB_ALLOCATE_MEMORY_FAILED; - LOG_WARN("allocate mem failed", K(ret)); - } - OZ (value.deep_copy(partition.list_row_value_, buf, size, pos)); - } - } - return ret; -} - -int ObExternalTableRowIterator::calc_exprs_for_rowid(const int64_t read_count, ObExternalIteratorState &state) -{ - int ret = OB_SUCCESS; - ObEvalCtx &eval_ctx = scan_param_->op_->get_eval_ctx(); - if (OB_NOT_NULL(file_id_expr_)) { - OZ (file_id_expr_->init_vector_for_write(eval_ctx, VEC_FIXED, read_count)); - for (int i = 0; OB_SUCC(ret) && i < read_count; i++) { - ObFixedLengthBase *vec = static_cast(file_id_expr_->get_vector(eval_ctx)); - vec->set_int(i, state.cur_file_id_); - } - file_id_expr_->set_evaluated_flag(eval_ctx); - } - if (OB_NOT_NULL(line_number_expr_)) { - OZ (line_number_expr_->init_vector_for_write(eval_ctx, VEC_FIXED, read_count)); - for (int i = 0; OB_SUCC(ret) && i < read_count; i++) { - ObFixedLengthBase *vec = static_cast(line_number_expr_->get_vector(eval_ctx)); - vec->set_int(i, state.cur_line_number_ + i); - } - line_number_expr_->set_evaluated_flag(eval_ctx); - } - state.cur_line_number_ += read_count; - state.batch_first_row_line_num_ = state.cur_line_number_ - read_count; - return ret; -} - -DEF_TO_STRING(ObExternalIteratorState) -{ - int64_t pos = 0; - J_OBJ_START(); - J_KV(K_(file_idx), - K_(part_id), - K_(cur_file_id), - K_(cur_line_number), - 
K_(cur_file_url), - K_(part_list_val)); - J_OBJ_END(); - return pos; -} - - -} -} + #define USING_LOG_PREFIX SQL + + #include "ob_external_table_access_service.h" + #include "share/backup/ob_backup_io_adapter.h" + #include "share/external_table/ob_external_table_utils.h" + #include "share/ob_device_manager.h" + #ifndef OB_BUILD_EMBED_MODE + #include "sql/engine/table/ob_parquet_table_row_iter.h" + #include "sql/engine/table/ob_orc_table_row_iter.h" + #endif + #include "sql/engine/cmd/ob_load_data_file_reader.h" + #include "sql/engine/table/ob_csv_table_row_iter.h" + #include "sql/engine/expr/ob_expr_regexp_context.h" + #include "share/config/ob_server_config.h" + + namespace oceanbase + { + namespace common { + extern const char *OB_STORAGE_ACCESS_TYPES_STR[]; + } + + namespace share + { + struct ObExternalTablePartInfo; + class ObExternalTablePartInfoArray; + } + using namespace share::schema; + using namespace common; + using namespace share; + namespace sql + { + + static constexpr uint64_t OB_STORAGE_ID_EXTERNAL = 2001; + + ObExternalDataAccessDriver::~ObExternalDataAccessDriver() { + close(); + if (OB_NOT_NULL(device_handle_)) { + ObDeviceManager::get_instance().release_device(device_handle_); + } + } + + void ObExternalDataAccessDriver::close() + { + if (OB_NOT_NULL(device_handle_) && fd_.is_valid()) { + int ret = OB_SUCCESS; + if (OB_FAIL(ObBackupIoAdapter::close_device_and_fd(device_handle_, fd_))) { + LOG_WARN("fail to close device and fd", KR(ret), K_(fd), KP_(device_handle)); + } + } + } + + bool ObExternalDataAccessDriver::is_opened() const + { + return fd_.is_valid(); + } + + int ObExternalDataAccessDriver::get_file_size(const ObString &url, int64_t &file_size) + { + int ret = OB_SUCCESS; + file_size = -1; + CONSUMER_GROUP_FUNC_GUARD(ObFunctionType::PRIO_IMPORT); + ObString url_cstring; + ObArenaAllocator allocator; + + if (OB_FAIL(ob_write_string(allocator, url, url_cstring, true/*c_style*/))) { + LOG_WARN("fail to copy string", KR(ret), K(url)); + } 
else if (OB_FAIL(ObBackupIoAdapter::get_file_length(url_cstring, access_info_, file_size))) { + LOG_WARN("fail to get file length", KR(ret), K(url_cstring), K_(access_info)); + } + + if (OB_OBJECT_NOT_EXIST == ret || OB_IO_ERROR == ret) { + file_size = -1; + ret = OB_SUCCESS; + } + return ret; + } + + int ObExternalDataAccessDriver::open(const char *url) + { + int ret = OB_SUCCESS; + if (OB_UNLIKELY(is_opened())) { + ret = OB_INIT_TWICE; + LOG_WARN("Data Access Driver has been opened", KR(ret), K(url)); + } else if (OB_FAIL(ObBackupIoAdapter::open_with_access_type( + device_handle_, fd_, access_info_, url, OB_STORAGE_ACCESS_READER, + ObStorageIdMod(OB_STORAGE_ID_EXTERNAL, ObStorageUsedMod::STORAGE_USED_EXTERNAL)))) { + LOG_WARN("fail to open Data Access Driver", KR(ret), K_(access_info), K(url)); + } + return ret; + } + + int ObExternalDataAccessDriver::pread(void *buf, const int64_t count, const int64_t offset, int64_t &read_size) + { + int ret = OB_SUCCESS; + ObIOHandle io_handle; + CONSUMER_GROUP_FUNC_GUARD(PRIO_IMPORT); + if (OB_FAIL(ObBackupIoAdapter::async_pread(*device_handle_, fd_, + static_cast(buf), offset, count, io_handle))) { + LOG_WARN("fail to async pread", KR(ret), + KP_(device_handle), K_(fd), KP(buf), K(offset), K(count)); + } else if (OB_FAIL(io_handle.wait())) { + LOG_WARN("fail to wait pread result", KR(ret), + KP_(device_handle), K_(fd), KP(buf), K(offset), K(count)); + } else { + read_size = io_handle.get_data_size(); + } + return ret; + } + + class ObExternalFileListArrayOpWithFilter : public ObBaseDirEntryOperator + { + public: + ObExternalFileListArrayOpWithFilter(ObIArray & name_array, + ObIArray & file_size, + ObExternalPathFilter *filter, + ObIAllocator& array_allocator) + : name_array_(name_array), file_size_(file_size), filter_(filter), allocator_(array_allocator) {} + + virtual bool need_get_file_size() const override { return true; } + int func(const dirent *entry) { + int ret = OB_SUCCESS; + if (OB_ISNULL(entry)) { + ret = 
OB_INVALID_ARGUMENT; + OB_LOG(WARN, "invalid list entry, entry is null"); + } else if (OB_ISNULL(entry->d_name)) { + ret = OB_INVALID_ARGUMENT; + OB_LOG(WARN, "invalid list entry, d_name is null"); + } else { + const ObString file_name(entry->d_name); + ObString tmp_file; + bool is_filtered = false; + if (!file_name.empty() && file_name[file_name.length() - 1] != '/') { + if (OB_NOT_NULL(filter_) && OB_FAIL(filter_->is_filtered(file_name, is_filtered))) { + LOG_WARN("fail check is filtered", K(ret)); + } else if (!is_filtered) { + if (OB_FAIL(ob_write_string(allocator_, file_name, tmp_file, true))) { + OB_LOG(WARN, "fail to save file name", K(ret), K(file_name)); + } else if (OB_FAIL(name_array_.push_back(tmp_file))) { + OB_LOG(WARN, "fail to push filename to array", K(ret), K(tmp_file)); + } else if (OB_FAIL(file_size_.push_back(get_size()))) { + OB_LOG(WARN, "fail to push size to array", K(ret), K(tmp_file)); + } + } + } + } + return ret; + } + + private: + ObIArray & name_array_; + ObIArray & file_size_; + ObExternalPathFilter *filter_; + ObIAllocator& allocator_; + }; + + class ObLocalFileListArrayOpWithFilter : public ObBaseDirEntryOperator + { + public: + ObLocalFileListArrayOpWithFilter(ObIArray &name_array, + ObIArray & file_size, + const ObString &path, + const ObString &origin_path, + ObExternalPathFilter *filter, + ObIAllocator &array_allocator) + : name_array_(name_array), file_size_(file_size), path_(path), origin_path_(origin_path), + filter_(filter), allocator_(array_allocator) {} + virtual bool need_get_file_size() const override { return true; } + int func(const dirent *entry) + { + int ret = OB_SUCCESS; + if (OB_ISNULL(entry)) { + ret = OB_INVALID_ARGUMENT; + OB_LOG(WARN, "invalid list entry, entry is null"); + } else if (OB_ISNULL(entry->d_name)) { + ret = OB_INVALID_ARGUMENT; + OB_LOG(WARN, "invalid list entry, d_name is null"); + } else { + const ObString file_name(entry->d_name); + ObSqlString full_path; + ObString tmp_file; + bool is_filtered 
= false; + ObString cur_path = path_; + if (file_name.case_compare(".") == 0 + || file_name.case_compare("..") == 0) { + //do nothing + } else if (OB_FAIL(full_path.assign(cur_path))) { + OB_LOG(WARN, "assign string failed", K(ret)); + } else if (full_path.length() > 0 && *(full_path.ptr() + full_path.length() - 1) != '/' && + OB_FAIL(full_path.append("/"))) { + OB_LOG(WARN, "append failed", K(ret)) ; + } else if (OB_FAIL(full_path.append(file_name))) { + OB_LOG(WARN, "append file name failed", K(ret)); + } else if (OB_NOT_NULL(filter_) && OB_FAIL(filter_->is_filtered(full_path.string(), is_filtered))) { + LOG_WARN("fail check is filtered", K(ret)); + } else if (!is_filtered) { + ObString target = full_path.string(); + if (!is_dir_scan()) { + target += origin_path_.length(); + if (!target.empty() && '/' == target[0]) { + target += 1; + } + } + if (target.empty()) { + ret = OB_ERR_UNEXPECTED; + LOG_WARN("empty dir or name", K(full_path), K(origin_path_)); + } else if (OB_FAIL(ob_write_string(allocator_, target, tmp_file, true/*c_style*/))) { + OB_LOG(WARN, "fail to save file name", K(ret), K(file_name)); + } else if (OB_FAIL(name_array_.push_back(tmp_file))) { + OB_LOG(WARN, "fail to push filename to array", K(ret), K(tmp_file)); + } else if (OB_FAIL(file_size_.push_back(get_size()))) { + OB_LOG(WARN, "fail to push size to array", K(ret), K(tmp_file)); + } + } + } + return ret; + } + private: + ObIArray &name_array_; + ObIArray &file_size_; + const ObString &path_; + const ObString &origin_path_; + ObExternalPathFilter *filter_; + ObIAllocator &allocator_; + }; + + + int ObExternalDataAccessDriver::get_file_list(const ObString &path, + const ObString &pattern, + const ObExprRegexpSessionVariables ®exp_vars, + ObIArray &file_urls, + ObIArray &file_sizes, + ObIAllocator &allocator) + { + int ret = OB_SUCCESS; + const int64_t MAX_VISIT_COUNT = 100000; + ObExprRegexContext regexp_ctx; + ObExternalPathFilter filter(regexp_ctx, allocator); + ObString path_cstring; + 
CONSUMER_GROUP_FUNC_GUARD(PRIO_IMPORT); + + if (OB_UNLIKELY(!access_info_->is_valid())) { + ret = OB_NOT_INIT; + LOG_WARN("ObExternalDataAccessDriver not init", KR(ret), K_(access_info)); + } else if (!pattern.empty() && OB_FAIL(filter.init(pattern, regexp_vars))) { + LOG_WARN("fail to init filter", K(ret)); + } else if (OB_FAIL(ob_write_string(allocator, path, path_cstring, true/*c_style*/))) { + LOG_WARN("fail to copy string", KR(ret), K(path)); + } else if (get_storage_type() == OB_STORAGE_FILE) { + ObSEArray file_dirs; + bool is_dir = false; + + if (get_storage_type() == OB_STORAGE_FILE) { + ObString path_without_prifix; + path_without_prifix = path_cstring; + path_without_prifix += strlen(OB_FILE_PREFIX); + + OZ(FileDirectoryUtils::is_directory(path_without_prifix.ptr(), is_dir)); + if (!is_dir) { + LOG_INFO("external location is not a directory", + K(path_without_prifix)); + } else { + OZ(file_dirs.push_back(path_cstring)); + } + } + + ObArray useless_size; + for (int64_t i = 0; OB_SUCC(ret) && i < file_dirs.count(); i++) { + ObString file_dir = file_dirs.at(i); + ObLocalFileListArrayOpWithFilter dir_op(file_dirs, useless_size, file_dir, path_cstring, NULL, allocator); + ObLocalFileListArrayOpWithFilter file_op(file_urls, file_sizes, file_dir, path_cstring, + pattern.empty() ? NULL : &filter, allocator); + dir_op.set_dir_flag(); + if (OB_FAIL(ObBackupIoAdapter::list_files(file_dir, access_info_, file_op))) { + LOG_WARN("fail to list files", KR(ret), K(file_dir), K_(access_info)); + } else if (OB_FAIL(ObBackupIoAdapter::list_directories(file_dir, access_info_, dir_op))) { + LOG_WARN("fail to list dirs", KR(ret), K(file_dir), K_(access_info)); + } else if (file_dirs.count() + file_urls.count() > MAX_VISIT_COUNT) { + ret = OB_SIZE_OVERFLOW; + LOG_WARN("too many files and dirs to visit", K(ret)); + } + } + } else { + ObExternalFileListArrayOpWithFilter file_op(file_urls, file_sizes, pattern.empty() ? 
NULL : &filter, allocator); + if (OB_FAIL(ObBackupIoAdapter::list_files(path_cstring, access_info_, file_op))) { + LOG_WARN("fail to list files", KR(ret), K(path_cstring), K_(access_info)); + } + } + return ret; + } + + int ObExternalDataAccessDriver::init(const ObString &location, const ObString &access_info) + { + int ret = OB_SUCCESS; + ObStorageType device_type = OB_STORAGE_MAX_TYPE; + ObArenaAllocator temp_allocator; + ObString location_cstr; + ObString access_info_cstr; + ObBackupIoAdapter util; + + if (OB_FAIL(get_storage_type_from_path(location, device_type))) { + LOG_WARN("fail to resove storage type", K(ret)); + } else { + storage_type_ = device_type; + // Note: if device type is file, the storage info is empty. + if (device_type == OB_STORAGE_FILE || + (OB_ISNULL(access_info) || OB_LIKELY(0 == access_info.length()))) { + OZ(ob_write_string(temp_allocator, location, location_cstr, true)); + access_info_cstr.assign_ptr(&dummy_empty_char, static_cast(strlen(&dummy_empty_char))); + } else { + OZ (ob_write_string(temp_allocator, location, location_cstr, true)); + OZ (ob_write_string(temp_allocator, access_info, access_info_cstr, true)); + } + } + access_info_ = &backup_storage_info_; + if (OB_ISNULL(access_info_)) { + ret = OB_INVALID_ARGUMENT; + LOG_WARN("failed to get access into", K(ret), K(device_type), K(access_info_cstr)); + } + LOG_TRACE("resolve storage into", K(ret), K(device_type), K(access_info_cstr)); + OZ (access_info_->set(device_type, access_info_cstr.ptr())); + + return ret; + } + + ObExternalStreamFileReader::~ObExternalStreamFileReader() + { + reset(); + } + + const char * ObExternalStreamFileReader::MEMORY_LABEL = "ExternalReader"; + const int64_t ObExternalStreamFileReader::COMPRESSED_DATA_BUFFER_SIZE = 2 * 1024 * 1024; + + int ObExternalStreamFileReader::init(const common::ObString &location, + const ObString &access_info, + ObCSVGeneralFormat::ObCSVCompression compression_format, + ObIAllocator &allocator) + { + int ret = OB_SUCCESS; + 
if (OB_NOT_NULL(allocator_)) { + ret = OB_INIT_TWICE; + } else if (OB_FAIL(data_access_driver_.init(location, access_info))) { + LOG_WARN("failed to init data access driver", K(ret), K(location), K(access_info)); + } else { + allocator_ = &allocator; + compression_format_ = compression_format; + } + return ret; + } + + int ObExternalStreamFileReader::open(const ObString &filename) + { + int ret = OB_SUCCESS; + if (OB_ISNULL(allocator_)) { + ret = OB_NOT_INIT; + } else if (data_access_driver_.is_opened()) { + ret = OB_INIT_TWICE; + } else if (OB_FAIL(data_access_driver_.open(filename.ptr()))) { + LOG_WARN("failed to open file", K(ret), K(filename)); + } else if (OB_FAIL(data_access_driver_.get_file_size(filename.ptr(), file_size_))) { + LOG_WARN("failed to get file size", K(ret), K(filename)); + } else { + is_file_end_ = false; + + ObCSVGeneralFormat::ObCSVCompression this_file_compression_format = compression_format_; + if (this_file_compression_format == ObCSVGeneralFormat::ObCSVCompression::AUTO + && OB_FAIL(compression_algorithm_from_suffix(filename, this_file_compression_format))) { + LOG_WARN("failed to dectect compression format from filename", K(ret), K(filename)); + } + + if (OB_SUCC(ret) && OB_FAIL(create_decompressor(this_file_compression_format))) { + LOG_WARN("failed to create decompressor", K(ret)); + } + } + + LOG_TRACE("open file done", K(filename), K(ret)); + return ret; + } + + void ObExternalStreamFileReader::close() + { + if (data_access_driver_.is_opened()) { + data_access_driver_.close(); + + is_file_end_ = true; + file_offset_ = 0; + file_size_ = 0; + LOG_DEBUG("close file"); + } + } + + void ObExternalStreamFileReader::reset() + { + close(); + if (OB_NOT_NULL(compressed_data_) && OB_NOT_NULL(allocator_)) { + allocator_->free(compressed_data_); + compressed_data_ = nullptr; + } + + if (OB_NOT_NULL(decompressor_)) { + ObDecompressor::destroy(decompressor_); + decompressor_ = nullptr; + } + + allocator_ = nullptr; + } + + bool 
ObExternalStreamFileReader::eof() + { + return is_file_end_; + } + + int ObExternalStreamFileReader::read(char *buf, int64_t buf_len, int64_t &read_size) + { + int ret = OB_SUCCESS; + read_size = 0; + + if (OB_ISNULL(buf) || buf_len <= 0) { + ret = OB_INVALID_ARGUMENT; + } else if (OB_ISNULL(decompressor_)) { + ret = read_from_driver(buf, buf_len, read_size); + is_file_end_ = file_offset_ >= file_size_; + LOG_DEBUG("read file", K(is_file_end_), K(file_offset_), K(file_size_), K(read_size)); + } else { + ret = read_decompress(buf, buf_len, read_size); + is_file_end_ = (file_offset_ >= file_size_) && (consumed_data_size_ >= compress_data_size_); + } + return ret; + } + + int ObExternalStreamFileReader::read_from_driver(char *buf, int64_t buf_len, int64_t &read_size) + { + int ret = OB_SUCCESS; + read_size = 0; + + if (OB_ISNULL(buf) || buf_len <= 0) { + ret = OB_INVALID_ARGUMENT; + } else if(OB_FAIL(data_access_driver_.pread(buf, buf_len, file_offset_, read_size))) { + LOG_WARN("failed to read data from data access driver", K(ret), K(file_offset_), K(buf_len)); + } else { + file_offset_ += read_size; + } + return ret; + } + + int ObExternalStreamFileReader::read_decompress(char *buf, int64_t buf_len, int64_t &read_size) + { + int ret = OB_SUCCESS; + read_size = 0; + + if (!data_access_driver_.is_opened()) { + ret = OB_NOT_INIT; + } else if (OB_ISNULL(buf) || buf_len <= 0) { + ret = OB_INVALID_ARGUMENT; + LOG_WARN("invalid argument", KP(buf), K(buf_len)); + } else if (consumed_data_size_ >= compress_data_size_) { + if (file_offset_ < file_size_) { + ret = read_compressed_data(); + } else { + is_file_end_ = true; + } + } + + if (OB_SUCC(ret) && compress_data_size_ > consumed_data_size_) { + int64_t consumed_size = 0; + ret = decompressor_->decompress(compressed_data_ + consumed_data_size_, + compress_data_size_ - consumed_data_size_, + consumed_size, + buf, + buf_len, + read_size); + if (OB_FAIL(ret)) { + LOG_WARN("failed to decompress", K(ret)); + } else { + 
consumed_data_size_ += consumed_size; + uncompressed_size_ += read_size; + } + } + return ret; + } + + int ObExternalStreamFileReader::read_compressed_data() + { + int ret = OB_SUCCESS; + char *read_buffer = compressed_data_; + if (!data_access_driver_.is_opened()) { + ret = OB_NOT_INIT; + } else if (OB_UNLIKELY(consumed_data_size_ < compress_data_size_)) { + // backup data + const int64_t last_data_size = compress_data_size_ - consumed_data_size_; + MEMMOVE(compressed_data_, compressed_data_ + consumed_data_size_, last_data_size); + read_buffer = compressed_data_ + last_data_size; + consumed_data_size_ = 0; + compress_data_size_ = last_data_size; + } else if (consumed_data_size_ == compress_data_size_) { + consumed_data_size_ = 0; + compress_data_size_ = 0; + } + + if (OB_SUCC(ret)) { + // read data from source reader + int64_t read_size = 0; + int64_t capacity = COMPRESSED_DATA_BUFFER_SIZE - compress_data_size_; + ret = read_from_driver(read_buffer, capacity, read_size); + if (OB_SUCC(ret)) { + compress_data_size_ += read_size; + } + } + return ret; + } + + int ObExternalStreamFileReader::create_decompressor(ObCSVGeneralFormat::ObCSVCompression compression_format) + { + int ret = OB_SUCCESS; + if (OB_ISNULL(allocator_)) { + ret = OB_NOT_INIT; + } else if (compression_format == ObCSVGeneralFormat::ObCSVCompression::NONE) { + ObDecompressor::destroy(decompressor_); + decompressor_ = nullptr; + } else if (OB_NOT_NULL(decompressor_) && decompressor_->compression_format() == compression_format) { + // do nothing + } else { + if (OB_NOT_NULL(decompressor_)) { + ObDecompressor::destroy(decompressor_); + decompressor_ = nullptr; + } + + if (OB_FAIL(ObDecompressor::create(compression_format, *allocator_, decompressor_))) { + LOG_WARN("failed to create decompressor", K(ret)); + } else if (OB_ISNULL(compressed_data_) && + OB_ISNULL(compressed_data_ = (char *)allocator_->alloc(COMPRESSED_DATA_BUFFER_SIZE))) { + ret = OB_ALLOCATE_MEMORY_FAILED; + LOG_WARN("failed to allocate 
memory", K(COMPRESSED_DATA_BUFFER_SIZE)); + } + } + return ret; + } + + int ObExternalTableAccessService::table_scan( + ObVTableScanParam ¶m, + ObNewRowIterator *&result) + { + ACTIVE_SESSION_FLAG_SETTER_GUARD(in_storage_read); + const share::ObLSID &ls_id = param.ls_id_; + common::ObASHTabletIdSetterGuard ash_tablet_id_guard(param.tablet_id_.id()); + ACTIVE_SESSION_RETRY_DIAG_INFO_SETTER(ls_id_, ls_id.id()); + ACTIVE_SESSION_RETRY_DIAG_INFO_SETTER(tablet_id_, param.tablet_id_.id()); + int ret = OB_SUCCESS; + ObExternalTableRowIterator* row_iter = NULL; + + auto &scan_param = static_cast(param); + + switch (param.external_file_format_.format_type_) { + case ObExternalFileFormat::CSV_FORMAT: + if (OB_ISNULL(row_iter = OB_NEWx(ObCSVTableRowIterator, (scan_param.allocator_)))) { + ret = OB_ALLOCATE_MEMORY_FAILED; + LOG_WARN("alloc memory failed", K(ret)); + } + break; + case ObExternalFileFormat::PARQUET_FORMAT: + #ifdef OB_BUILD_EMBED_MODE + ret = OB_NOT_SUPPORTED; + LOG_WARN("parquet not supported in embed mode", K(ret)); + #else + if (OB_ISNULL(row_iter = OB_NEWx(ObParquetTableRowIterator, (scan_param.allocator_)))) { + ret = OB_ALLOCATE_MEMORY_FAILED; + LOG_WARN("alloc memory failed", K(ret)); + } + #endif + break; + case ObExternalFileFormat::ODPS_FORMAT: + if (!GCONF._use_odps_jni_connector) { + ret = OB_NOT_SUPPORTED; + LOG_WARN("odps cpp connector is not enabled", K(ret)); + } else { + ret = OB_NOT_SUPPORTED; + LOG_WARN("odps jni connector is not enabled", K(ret)); + } + break; + case ObExternalFileFormat::ORC_FORMAT: + ret = OB_NOT_SUPPORTED; + break; + default: + ret = OB_ERR_UNEXPECTED; + LOG_WARN("unexpected format", K(ret), "format", param.external_file_format_.format_type_); + } + + if (OB_SUCC(ret)) { + if (OB_FAIL(row_iter->init(&scan_param))) { + row_iter->~ObExternalTableRowIterator(); + LOG_WARN("fail to open iter", K(ret)); + } else { + result = row_iter; + } + } + + LOG_DEBUG("external table access service iter init", K(ret), "type", 
param.external_file_format_.format_type_); + + return ret; + } + + int ObExternalTableAccessService::table_rescan(ObVTableScanParam ¶m, ObNewRowIterator *result) + { + ACTIVE_SESSION_FLAG_SETTER_GUARD(in_storage_read); + const share::ObLSID &ls_id = param.ls_id_; + common::ObASHTabletIdSetterGuard ash_tablet_id_guard(param.tablet_id_.id()); + ACTIVE_SESSION_RETRY_DIAG_INFO_SETTER(ls_id_, ls_id.id()); + ACTIVE_SESSION_RETRY_DIAG_INFO_SETTER(tablet_id_, param.tablet_id_.id()); + int ret = OB_SUCCESS; + if (OB_ISNULL(result)) { + ret = OB_ERR_UNEXPECTED; + LOG_WARN("unexpected iter", K(ret)); + } else { + switch (param.external_file_format_.format_type_) { + case ObExternalFileFormat::CSV_FORMAT: + case ObExternalFileFormat::PARQUET_FORMAT: + result->reset(); + break; + case ObExternalFileFormat::ORC_FORMAT: + ret = OB_NOT_SUPPORTED; + break; + case ObExternalFileFormat::ODPS_FORMAT: + ret = OB_NOT_SUPPORTED; + LOG_USER_ERROR(OB_NOT_SUPPORTED, "external odps table"); + LOG_WARN("not support to read odps in opensource", K(ret)); + break; + default: + ret = OB_ERR_UNEXPECTED; + LOG_WARN("unexpected format", K(ret), "format", param.external_file_format_.format_type_); + } + } + LOG_DEBUG("external table rescan", K(param.key_ranges_), K(param.range_array_pos_)); + return ret; + } + + int ObExternalTableAccessService::reuse_scan_iter(const bool switch_param, ObNewRowIterator *iter) + { + UNUSED(switch_param); + iter->reset(); + return OB_SUCCESS; + } + + int ObExternalTableAccessService::revert_scan_iter(ObNewRowIterator *iter) + { + int ret = OB_SUCCESS; + if (OB_ISNULL(iter)) { + ret = OB_ERR_UNEXPECTED; + LOG_WARN("unexpected iter", K(ret)); + } else { + iter->~ObNewRowIterator(); + } + return ret; + } + + int ObExternalTableRowIterator::init(const ObTableScanParam *scan_param) + { + scan_param_ = scan_param; + return init_exprs(scan_param); + } + + int ObExternalTableRowIterator::gen_ip_port(ObIAllocator &allocator) + { + int ret = OB_SUCCESS; + char 
buf[MAX_IP_PORT_SQL_LENGTH]; + int32_t len = 0; + OZ (GCONF.self_addr_.addr_to_buffer(buf, MAX_IP_PORT_SQL_LENGTH, len)); + OZ (ob_write_string(allocator, ObString(len, buf), ip_port_)); + return ret; + } + + int ObExternalTableRowIterator::init_exprs(const storage::ObTableScanParam *scan_param) + { + int ret = OB_SUCCESS; + if (OB_ISNULL(scan_param)) { + ret = OB_ERR_UNEXPECTED; + LOG_WARN("scan param is null", K(ret)); + } else { + if (scan_param->column_ids_.count() != scan_param->output_exprs_->count()) { + ret = OB_ERR_UNEXPECTED; + LOG_WARN("column ids not equal to access expr", K(ret)); + } + for (int i = 0; OB_SUCC(ret) && i < scan_param->column_ids_.count(); i++) { + ObExpr *cur_expr = scan_param->output_exprs_->at(i); + switch (scan_param->column_ids_.at(i)) { + case OB_HIDDEN_LINE_NUMBER_COLUMN_ID: + line_number_expr_ = cur_expr; + break; + case OB_HIDDEN_FILE_ID_COLUMN_ID: + file_id_expr_ = cur_expr; + break; + default: + OZ (column_exprs_.push_back(cur_expr)); + break; + } + } + if (OB_SUCC(ret) && column_exprs_.count() != scan_param->ext_column_convert_exprs_->count()) { + ret = OB_ERR_UNEXPECTED; + LOG_WARN("column expr not equal to convert convert expr", K(ret), + K(column_exprs_), KPC(scan_param->ext_column_convert_exprs_)); + } + } + return ret; + } + + int ObExternalTableRowIterator::fill_file_partition_expr(ObExpr *expr, ObNewRow &value, const int64_t row_count) + { + int ret = OB_SUCCESS; + ObEvalCtx &eval_ctx = scan_param_->op_->get_eval_ctx(); + ObDatum *datums = expr->locate_batch_datums(eval_ctx); + int64_t loc_idx = expr->extra_ - 1; + if (OB_UNLIKELY(loc_idx < 0 || loc_idx >= value.get_count())) { + ret = OB_ERR_UNEXPECTED; + LOG_WARN("loc idx is out of range", K(loc_idx), K(value), K(ret)); + } else { + if (value.get_cell(loc_idx).is_null()) { + for (int j = 0; OB_SUCC(ret) && j < row_count; j++) { + datums[j].set_null(); + } + } else { + for (int j = 0; OB_SUCC(ret) && j < row_count; j++) { + CK (OB_NOT_NULL(datums[j].ptr_)); + OZ 
(datums[j].from_obj(value.get_cell(loc_idx))); + } + } + } + return ret; + } + + int ObExternalTableRowIterator::calc_file_partition_list_value(const int64_t part_id, ObIAllocator &allocator, ObNewRow &value) + { + int ret = OB_SUCCESS; + share::schema::ObSchemaGetterGuard schema_guard; + const ObTableSchema *table_schema = NULL; + const ObPartition *partition = NULL; + ObExternalFileFormat::FormatType external_table_type; + bool is_odps_external_table = false; + if (OB_ISNULL(GCTX.schema_service_)) { + ret = OB_ERR_UNEXPECTED; + LOG_WARN("unexpected error"); + } else if (OB_FAIL(GCTX.schema_service_->get_tenant_schema_guard( + scan_param_->tenant_id_, + schema_guard))) { + LOG_WARN("get_schema_guard failed", K(ret)); + } else if (OB_FAIL(schema_guard.get_table_schema(scan_param_->tenant_id_, scan_param_->index_id_, table_schema))) { + LOG_WARN("get table schema failed", K(ret)); + } else if (OB_ISNULL(table_schema)) { + ret = OB_TABLE_NOT_EXIST; + LOG_WARN("table not exist", K(scan_param_->index_id_), K(scan_param_->tenant_id_)); + } else if (OB_FAIL(ObSQLUtils::is_odps_external_table(table_schema, is_odps_external_table))) { + LOG_WARN("failed to check is odps external table or not", K(ret)); + } else if (table_schema->is_partitioned_table() && (table_schema->is_user_specified_partition_for_external_table() || is_odps_external_table)) { + if (OB_FAIL(table_schema->get_partition_by_part_id(part_id, CHECK_PARTITION_MODE_NORMAL, partition))) { + LOG_WARN("get partition failed", K(ret), K(part_id)); + } else if (OB_ISNULL(partition) || OB_UNLIKELY(partition->get_list_row_values().count() != 1) + || partition->get_list_row_values().at(0).get_count() != table_schema->get_partition_key_column_num()) { + ret = OB_ERR_UNEXPECTED; + LOG_WARN("partition is invalid", K(ret), K(part_id)); + } else { + int64_t pos = 0; + int64_t size = partition->get_list_row_values().at(0).get_deep_copy_size(); + char *buf = (char *)allocator.alloc(size); + if (OB_ISNULL(buf)) { + ret = 
OB_ALLOCATE_MEMORY_FAILED; + LOG_WARN("allocate mem failed", K(ret)); + } + OZ (value.deep_copy(partition->get_list_row_values().at(0), buf, size, pos)); + } + } + return ret; + } + int ObExternalTableRowIterator::calc_file_part_list_value_by_array( + const int64_t part_id, ObIAllocator &allocator, + const share::ObExternalTablePartInfoArray *partition_array, ObNewRow &value) + { + int ret = OB_SUCCESS; + int64_t partition_index = OB_INVALID_INDEX; + share::ObExternalTablePartInfo partition; + + int64_t partition_num = partition_array->count(); + if (OB_ISNULL(partition_array) || partition_num <= 0) { + ret = OB_ERR_UNEXPECTED; + LOG_WARN("invalid partition array", K(ret), K(part_id), K(partition_num)); + } + + for (int64_t i = 0; OB_SUCC(ret) && i < partition_num; i++) { + if (part_id == partition_array->at(i).part_id_) { + partition_index = i; + break; + } + } + + if (OB_SUCC(ret) && partition_index != OB_INVALID_INDEX) { + partition = partition_array->at(partition_index); + } + + if (OB_SUCC(ret)) { + if (partition_index == OB_INVALID_INDEX || partition.part_id_ != part_id) { + ret = OB_ERR_UNEXPECTED; + LOG_WARN("invalid partition", K(ret), K(partition), K(part_id)); + } else { + int64_t pos = 0; + int64_t size = partition.list_row_value_.get_deep_copy_size(); + char *buf = (char *)allocator.alloc(size); + if (OB_ISNULL(buf)) { + ret = OB_ALLOCATE_MEMORY_FAILED; + LOG_WARN("allocate mem failed", K(ret)); + } + OZ (value.deep_copy(partition.list_row_value_, buf, size, pos)); + } + } + return ret; + } + + int ObExternalTableRowIterator::calc_exprs_for_rowid(const int64_t read_count, ObExternalIteratorState &state) + { + int ret = OB_SUCCESS; + ObEvalCtx &eval_ctx = scan_param_->op_->get_eval_ctx(); + if (OB_NOT_NULL(file_id_expr_)) { + OZ (file_id_expr_->init_vector_for_write(eval_ctx, VEC_FIXED, read_count)); + for (int i = 0; OB_SUCC(ret) && i < read_count; i++) { + ObFixedLengthBase *vec = static_cast(file_id_expr_->get_vector(eval_ctx)); + vec->set_int(i, 
state.cur_file_id_); + } + file_id_expr_->set_evaluated_flag(eval_ctx); + } + if (OB_NOT_NULL(line_number_expr_)) { + OZ (line_number_expr_->init_vector_for_write(eval_ctx, VEC_FIXED, read_count)); + for (int i = 0; OB_SUCC(ret) && i < read_count; i++) { + ObFixedLengthBase *vec = static_cast(line_number_expr_->get_vector(eval_ctx)); + vec->set_int(i, state.cur_line_number_ + i); + } + line_number_expr_->set_evaluated_flag(eval_ctx); + } + state.cur_line_number_ += read_count; + state.batch_first_row_line_num_ = state.cur_line_number_ - read_count; + return ret; + } + + DEF_TO_STRING(ObExternalIteratorState) + { + int64_t pos = 0; + J_OBJ_START(); + J_KV(K_(file_idx), + K_(part_id), + K_(cur_file_id), + K_(cur_line_number), + K_(cur_file_url), + K_(part_list_val)); + J_OBJ_END(); + return pos; + } + + + } + } + \ No newline at end of file From e0d2d77aac61f27eeb5b23b7258b2560e5214a06 Mon Sep 17 00:00:00 2001 From: hnwyllmm Date: Wed, 20 May 2026 11:23:26 +0800 Subject: [PATCH 2/6] fixed tab --- src/sql/engine/basic/ob_select_into_op.cpp | 4807 ++++++++--------- .../ob_external_table_access_service.cpp | 1739 +++--- 2 files changed, 3272 insertions(+), 3274 deletions(-) diff --git a/src/sql/engine/basic/ob_select_into_op.cpp b/src/sql/engine/basic/ob_select_into_op.cpp index 293b8cf29..20295c9ee 100644 --- a/src/sql/engine/basic/ob_select_into_op.cpp +++ b/src/sql/engine/basic/ob_select_into_op.cpp @@ -1,2412 +1,2411 @@ /* - * Copyright (c) 2025 OceanBase. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ - - #define USING_LOG_PREFIX SQL_ENG - - #include - #include - - #include "ob_select_into_op.h" - #include "sql/engine/cmd/ob_variable_set_executor.h" - #include "lib/charset/ob_charset_string_helper.h" - #include "sql/engine/px/ob_px_sqc_handler.h" - #include "sql/engine/expr/ob_expr_json_func_helper.h" - #include "lib/udt/ob_collection_type.h" - #include "share/config/ob_server_config.h" - - #ifndef OB_BUILD_EMBED_MODE - #include - #include - #include - #include - #include - #include - #include - - #define ARROW_FAIL(statement) (OB_UNLIKELY(!(statement).ok())) - - #endif - - namespace oceanbase - { - using namespace common; - namespace sql - { - - OB_SERIALIZE_MEMBER(ObSelectIntoOpInput, task_id_, sqc_id_); - OB_SERIALIZE_MEMBER((ObSelectIntoSpec, ObOpSpec), into_type_, user_vars_, outfile_name_, - field_str_, // FARM COMPAT WHITELIST FOR filed_str_: renamed - line_str_, closed_cht_, is_optional_, select_exprs_, is_single_, max_file_size_, - escaped_cht_, cs_type_, parallel_, file_partition_expr_, buffer_size_, is_overwrite_, - external_properties_, external_partition_, alias_names_); - - - int ObSelectIntoOp::inner_open() - { - int ret = OB_SUCCESS; - ObSQLSessionInfo *session = NULL; - if (OB_ISNULL(session = ctx_.get_my_session())) { - ret = OB_ERR_UNEXPECTED; - LOG_WARN("get session failed", K(ret)); - } else { - // since we call get_next_row in inner_open, we have to set opened_ first in avoid to a infinite loop. 
- opened_ = true; - if (OB_FAIL(session->get_sql_select_limit(top_limit_cnt_))) { - LOG_WARN("fail tp get sql select limit", K(ret)); - } - } - if (OB_SUCC(ret) && !MY_SPEC.external_properties_.str_.empty()) { - if (OB_FAIL(external_properties_.load_from_string(MY_SPEC.external_properties_.str_, - ctx_.get_allocator()))) { - LOG_WARN("failed to load external properties", K(ret)); - } else { - format_type_ = external_properties_.format_type_; - } - } - if (OB_SUCC(ret)) { - switch (format_type_) - { - case ObExternalFileFormat::FormatType::CSV_FORMAT: - { - if (OB_FAIL(init_csv_env())) { - LOG_WARN("failed to init csv env", K(ret)); - } - break; - } - case ObExternalFileFormat::FormatType::ODPS_FORMAT: - { - if (!GCONF._use_odps_jni_connector) { - ret = OB_NOT_SUPPORTED; - LOG_WARN("not support odps format", K(ret)); - } else { - ret = OB_NOT_SUPPORTED; - LOG_WARN("not support odps format", K(ret)); - } - break; - } - case ObExternalFileFormat::FormatType::PARQUET_FORMAT: - { - #ifndef OB_BUILD_EMBED_MODE - if (OB_FAIL(init_parquet_env())) { - LOG_WARN("failed to init parquet env", K(ret)); - } - #endif - break; - } - case ObExternalFileFormat::FormatType::ORC_FORMAT: - { - ret = OB_NOT_SUPPORTED; - break; - } - default: - { - ret = OB_NOT_SUPPORTED; - LOG_WARN("not support select into type", K(format_type_)); - } - } - } - return ret; - } - - int ObSelectIntoOp::init_csv_env() - { - int ret = OB_SUCCESS; - ObSQLSessionInfo *session = NULL; - set_csv_format_options(); - if (OB_ISNULL(session = ctx_.get_my_session())) { - ret = OB_ERR_UNEXPECTED; - LOG_WARN("get session failed", K(ret)); - } else if (OB_FAIL(init_env_common())) { - LOG_WARN("failed to init env common", K(ret)); - } else if (OB_FAIL(prepare_escape_printer())) { - LOG_WARN("failed to calc escape info", K(ret)); - } else { - if (external_properties_.csv_format_.compression_algorithm_ != CsvCompressType::NONE) { - has_compress_ = true; - } - // setup binary output format for bit/binary - switch 
(external_properties_.csv_format_.binary_format_) { - case ObCSVGeneralFormat::ObCSVBinaryFormat::DEFAULT: - print_params_.binary_string_print_hex_ = false; - break; - case ObCSVGeneralFormat::ObCSVBinaryFormat::HEX: - print_params_.binary_string_print_hex_ = true; - break; - case ObCSVGeneralFormat::ObCSVBinaryFormat::BASE64: - print_params_.binary_string_print_base64_ = true; - break; - default: - ret = OB_ERR_UNEXPECTED; - LOG_WARN("failed to set csv binary output format", K(ret)); - } - print_params_.tz_info_ = session->get_timezone_info(); - print_params_.use_memcpy_ = true; - print_params_.cs_type_ = cs_type_; - } - //create buffer - if (OB_SUCC(ret) && T_INTO_OUTFILE == MY_SPEC.into_type_ && OB_FAIL(create_shared_buffer_for_data_writer())) { - LOG_WARN("failed to create buffer for data writer", K(ret)); - } - return ret; - } - - void ObSelectIntoOp::set_csv_format_options() - { - if (MY_SPEC.external_properties_.str_.empty()) { - field_str_ = MY_SPEC.field_str_; - line_str_ = MY_SPEC.line_str_; - has_enclose_ = MY_SPEC.closed_cht_.get_val_len() > 0; - char_enclose_ = has_enclose_ ? MY_SPEC.closed_cht_.get_char().ptr()[0] : 0; - is_optional_ = MY_SPEC.is_optional_; - has_escape_ = MY_SPEC.escaped_cht_.get_val_len() > 0; - char_escape_ = has_escape_ ? 
MY_SPEC.escaped_cht_.get_char().ptr()[0] : 0; - cs_type_ = MY_SPEC.cs_type_; - } else { - is_optional_ = external_properties_.csv_format_.is_optional_; - cs_type_ = ObCharset::get_default_collation(external_properties_.csv_format_.cs_type_); - field_str_.set_varchar(external_properties_.csv_format_.field_term_str_); - field_str_.set_collation_type(cs_type_); - line_str_.set_varchar(external_properties_.csv_format_.line_term_str_); - line_str_.set_collation_type(cs_type_); - if (external_properties_.csv_format_.field_enclosed_char_ == INT64_MAX) { // null - has_enclose_ = false; - char_enclose_ = 0; - } else { - has_enclose_ = true; - char_enclose_ = external_properties_.csv_format_.field_enclosed_char_; - } - if (external_properties_.csv_format_.field_escaped_char_ == INT64_MAX) { // null - has_escape_ = false; - char_escape_ = 0; - } else { - has_escape_ = true; - char_escape_ = external_properties_.csv_format_.field_escaped_char_; - } - } - } - - int ObSelectIntoOp::init_env_common() - { - int ret = OB_SUCCESS; - ObPhysicalPlanCtx *phy_plan_ctx = NULL; - bool need_check = false; - file_name_ = MY_SPEC.outfile_name_; - do_partition_ = MY_SPEC.file_partition_expr_ == NULL ? 
false : true; - if (OB_ISNULL(phy_plan_ctx = ctx_.get_physical_plan_ctx())) { - ret = OB_ERR_UNEXPECTED; - LOG_WARN("get phy_plan_ctx failed", K(ret)); - } else if (OB_FAIL(ObSQLUtils::get_param_value(MY_SPEC.outfile_name_, - phy_plan_ctx->get_param_store(), - file_name_, - need_check))) { - LOG_WARN("get param value failed", K(ret)); - } else if (OB_FAIL(calc_url_and_set_access_info())) { - LOG_WARN("failed to calc basic url and set device handle", K(ret)); - } else if (OB_FAIL(check_has_lob_or_json())) { - LOG_WARN("failed to check has lob", K(ret)); - } else if (has_coll_ && MY_SPEC.into_type_ == T_INTO_VARIABLES) { - ret = OB_NOT_SUPPORTED; - LOG_USER_ERROR(OB_NOT_SUPPORTED, "select array/map into variables"); - } else if (do_partition_ - && OB_FAIL(partition_map_.create(128, ObLabel("SelectInto"), ObLabel("SelectInto"), MTL_ID()))) { - LOG_WARN("failed to create hashmap", K(ret)); - } else if (MY_SPEC.select_exprs_.count() != MY_SPEC.alias_names_.strs_.count()) { - ret = OB_ERR_UNEXPECTED; - LOG_WARN("get unexpected column count", K(MY_SPEC.select_exprs_.count()), - K(MY_SPEC.alias_names_.strs_.count()), K(ret)); - } - return ret; - } - - //calc first data_writer.url_ and basic_url_ - int ObSelectIntoOp::calc_url_and_set_access_info() - { - int ret = OB_SUCCESS; - const ObItemType into_type = MY_SPEC.into_type_; - ObString path = file_name_.get_varchar().trim(); - if (path.prefix_match_ci(OB_S3_PREFIX)) { - ret = OB_NOT_SUPPORTED; - LOG_USER_ERROR(OB_NOT_SUPPORTED, "S3 storage"); - LOG_WARN("S3 storage is not supported", K(ret)); - } else if (path.prefix_match_ci(OB_AZBLOB_PREFIX)) { - file_location_ = IntoFileLocation::REMOTE_AZBLOB; - } else if (path.prefix_match_ci(OB_OSS_PREFIX)) { - ret = OB_NOT_SUPPORTED; - LOG_USER_ERROR(OB_NOT_SUPPORTED, "OSS storage"); - LOG_WARN("OSS storage is not supported", K(ret)); - } else if (path.prefix_match_ci(OB_COS_PREFIX)) { - ret = OB_NOT_SUPPORTED; - LOG_USER_ERROR(OB_NOT_SUPPORTED, "COS storage"); - LOG_WARN("COS 
storage is not supported", K(ret)); - } else if (path.prefix_match_ci(OB_HDFS_PREFIX)) { - ret = OB_NOT_SUPPORTED; - LOG_USER_ERROR(OB_NOT_SUPPORTED, "HDFS storage"); - LOG_WARN("HDFS storage is not supported", K(ret)); - } else { - file_location_ = IntoFileLocation::SERVER_DISK; - } - if (file_location_ == IntoFileLocation::SERVER_DISK && do_partition_) { - ret = OB_NOT_SUPPORTED; - LOG_WARN("not support partition option on server disk", K(ret)); - LOG_USER_ERROR(OB_NOT_SUPPORTED, "partition option on server disk"); - } else if (T_INTO_OUTFILE == into_type && !MY_SPEC.is_single_ && OB_FAIL(calc_first_file_path(path))) { - LOG_WARN("failed to calc first file path", K(ret)); - } else if (file_location_ != IntoFileLocation::SERVER_DISK) { - ObString temp_url = path.split_on('?'); - temp_url.trim(); - ObString storage_info; - if (OB_FAIL(ob_write_string(ctx_.get_allocator(), temp_url, basic_url_, true))) { - LOG_WARN("failed to append string", K(ret)); - } else if (OB_FAIL(ob_write_string(ctx_.get_allocator(), path, storage_info, true))) { - LOG_WARN("failed to append string", K(ret)); - } else if (OB_FAIL(access_info_.set(basic_url_.ptr(), storage_info.ptr()))) { - LOG_WARN("failed to set access info", K(ret), K(path)); - } else if (basic_url_.empty() || !access_info_.is_valid()) { - ret = OB_FILE_NOT_EXIST; - LOG_WARN("file path not exist", K(ret), K(basic_url_), K(access_info_)); - } - } else { // IntoFileLocation::SERVER_DISK - if (OB_FAIL(ob_write_string(ctx_.get_allocator(), path, basic_url_, true))) { - LOG_WARN("failed to write string", K(ret)); - } - } - if (OB_SUCC(ret) && (T_INTO_OUTFILE == into_type || T_INTO_DUMPFILE == into_type) - && IntoFileLocation::SERVER_DISK == file_location_ && OB_FAIL(check_secure_file_path(basic_url_))) { - LOG_WARN("failed to check secure file path", K(ret)); - } - return ret; - } - // csv, odps supports batch and non-batch interfaces; parquet, orc only supports batch interface; non-batch interface will be discontinued later - 
int ObSelectIntoOp::inner_get_next_row() - { - int ret = 0 == top_limit_cnt_ ? OB_ITER_END : OB_SUCCESS; - int64_t row_count = 0; - const ObItemType into_type = MY_SPEC.into_type_; - ObPhysicalPlanCtx *phy_plan_ctx = NULL; - ObExternalFileWriter *data_writer = NULL; - if (ObExternalFileFormat::FormatType::CSV_FORMAT != format_type_ - && ObExternalFileFormat::FormatType::ODPS_FORMAT != format_type_) { - ret = OB_NOT_SUPPORTED; - LOG_WARN("this type not supported in not batch interface", K(ret), K(format_type_)); - LOG_USER_ERROR(OB_NOT_SUPPORTED, "this upload type"); - } else if (OB_ISNULL(phy_plan_ctx = ctx_.get_physical_plan_ctx())) { - ret = OB_ERR_UNEXPECTED; - LOG_WARN("get phy_plan_ctx failed", K(ret)); - } - //when do_partition is false, create the only data_writer here - if (OB_SUCC(ret) && ObExternalFileFormat::FormatType::CSV_FORMAT == format_type_ - && T_INTO_VARIABLES != into_type && !do_partition_ - && OB_FAIL(create_the_only_data_writer(data_writer))) { - LOG_WARN("failed to create the only data writer", K(ret)); - } - while (OB_SUCC(ret) && row_count < top_limit_cnt_) { - clear_evaluated_flag(); - if (OB_FAIL(child_->get_next_row())) { - if (OB_LIKELY(OB_ITER_END == ret)) { - } else { - LOG_WARN("get next row failed", K(ret)); - } - } else { - ++row_count; - if (ObExternalFileFormat::FormatType::ODPS_FORMAT == format_type_) { - if (is_odps_cpp_table_ == is_odps_java_table_) { - ret = OB_ERR_UNEXPECTED; - LOG_WARN("invalid table mode for odps table", K(ret), - K(is_odps_cpp_table_), K(is_odps_java_table_)); - } else if (is_odps_cpp_table_) { - ret = OB_NOT_SUPPORTED; - LOG_USER_ERROR(OB_NOT_SUPPORTED, "external odps cpp table"); - LOG_WARN("use supported version", K(ret)); - } else { - ret = OB_NOT_SUPPORTED; - LOG_USER_ERROR(OB_NOT_SUPPORTED, "external odps table"); - LOG_WARN("not support jni odps single write", K(ret)); - } - } else if (T_INTO_VARIABLES == into_type) { - if (OB_FAIL(into_varlist())) { - LOG_WARN("into varlist failed", K(ret)); - } - 
} else if (T_INTO_OUTFILE == into_type) { - if (OB_FAIL(into_outfile(data_writer))) { - LOG_WARN("into outfile failed", K(ret)); - } - } else { - if (OB_FAIL(into_dumpfile(data_writer))) { - LOG_WARN("into dumpfile failed", K(ret)); - } - } - } - if (OB_SUCC(ret) || OB_ITER_END == ret) { // if into user variables or into dumpfile, must be one row - if (ObExternalFileFormat::FormatType::CSV_FORMAT == format_type_ - && (T_INTO_VARIABLES == into_type || T_INTO_DUMPFILE == into_type) && row_count > 1) { - ret = OB_ERR_TOO_MANY_ROWS; - LOG_WARN("more than one row for into variables or into dumpfile", K(ret), K(row_count)); - } - } - } //end while - if (OB_ITER_END == ret || OB_SUCC(ret)) { // set affected rows - phy_plan_ctx->set_affected_rows(row_count); - } - if (OB_FAIL(ret) && OB_ITER_END != ret) { - need_commit_ = false; - } - return ret; - } - - int ObSelectIntoOp::inner_get_next_batch(const int64_t max_row_cnt) - { - int ret = OB_SUCCESS; - const ObBatchRows *child_brs = NULL; - int64_t batch_size = min(max_row_cnt, MY_SPEC.max_batch_size_); - int64_t row_count = 0; - const ObItemType into_type = MY_SPEC.into_type_; - ObPhysicalPlanCtx *phy_plan_ctx = NULL; - ObExternalFileWriter *data_writer = NULL; - bool stop_loop = false; - bool is_iter_end = false; - if (OB_ISNULL(phy_plan_ctx = ctx_.get_physical_plan_ctx())) { - ret = OB_ERR_UNEXPECTED; - LOG_WARN("get phy_plan_ctx failed", K(ret)); - } - //when do_partition is false, create the only data_writer here - if (OB_SUCC(ret) && T_INTO_VARIABLES != into_type && !do_partition_ - && (ObExternalFileFormat::FormatType::CSV_FORMAT == format_type_ - || ObExternalFileFormat::FormatType::PARQUET_FORMAT == format_type_ - )) { - if (OB_FAIL(create_the_only_data_writer(data_writer))) { - LOG_WARN("failed to create the only data writer", K(ret)); - } else if (OB_ISNULL(data_writer)) { - ret = OB_ERR_UNEXPECTED; - LOG_WARN("get unexpected null", K(ret)); - } - } - - if (0 == top_limit_cnt_) { - brs_.size_ = 0; - brs_.end_ = 
true; - stop_loop = true; - } - while (OB_SUCC(ret) && !stop_loop) { - clear_evaluated_flag(); - int64_t rowkey_batch_size = min(batch_size, top_limit_cnt_ - row_count); - if (OB_FAIL(child_->get_next_batch(rowkey_batch_size, child_brs))) { - LOG_WARN("get next batch failed", K(ret)); - } else { - brs_.size_ = child_brs->size_; - brs_.end_ = child_brs->end_; - is_iter_end = brs_.end_ && 0 == brs_.size_; - if (brs_.size_ > 0) { - brs_.skip_->deep_copy(*(child_brs->skip_), brs_.size_); - row_count += brs_.size_ - brs_.skip_->accumulate_bit_cnt(brs_.size_); - if (ObExternalFileFormat::FormatType::ODPS_FORMAT == format_type_) { - if (!GCONF._use_odps_jni_connector) { - ret = OB_NOT_SUPPORTED; - LOG_WARN("odps cpp connector is not supported", K(ret)); - } else { - ret = OB_NOT_SUPPORTED; - LOG_WARN("odps jni connector is not supported", K(ret)); - } - } else if (T_INTO_OUTFILE == into_type) { - if (ObExternalFileFormat::FormatType::CSV_FORMAT == format_type_) { - if (OB_FAIL(into_outfile_batch_csv(brs_, data_writer))) { - LOG_WARN("csv into outfile batch failed", K(ret)); - } - } else if (ObExternalFileFormat::FormatType::PARQUET_FORMAT == format_type_) { - #ifndef OB_BUILD_EMBED_MODE - if (OB_FAIL(into_outfile_batch_parquet(brs_, data_writer))) { - LOG_WARN("parquet into outfile batch failed", K(ret)); - } - #else - ret = OB_NOT_SUPPORTED; - LOG_WARN("parquet is not supported in embed mode", K(ret)); - #endif // OB_BUILD_EMBED_MODE - } else if (ObExternalFileFormat::FormatType::ORC_FORMAT == format_type_) { - ret = OB_NOT_SUPPORTED; - } else { - ret = OB_NOT_SUPPORTED; - LOG_WARN("not support to write into outfile format.", K(ret), K(format_type_)); - } - } else { - ObEvalCtx::BatchInfoScopeGuard guard(eval_ctx_); - guard.set_batch_size(brs_.size_); - for (int64_t i = 0; OB_SUCC(ret) && i < brs_.size_; i++) { - if (brs_.skip_->contain(i)) { - continue; - } - guard.set_batch_idx(i); - if (T_INTO_VARIABLES == into_type) { - if (OB_FAIL(into_varlist())) { - LOG_WARN("into 
varlist failed", K(ret)); - } - } else { - if (OB_FAIL(into_dumpfile(data_writer))) { - LOG_WARN("into dumpfile failed", K(ret)); - } - } - } - } - } - } - if (is_iter_end || row_count >= top_limit_cnt_) { - stop_loop = true; - } - if (OB_SUCC(ret) || is_iter_end) { // if into user variables or into dumpfile, must be one row - if ((T_INTO_VARIABLES == into_type || T_INTO_DUMPFILE == into_type) && row_count > 1) { - ret = OB_ERR_TOO_MANY_ROWS; - LOG_WARN("more than one row for into variables or into dumpfile", K(ret), K(row_count)); - } - } - } //end while - if (OB_SUCC(ret)) { // set affected rows - phy_plan_ctx->set_affected_rows(row_count); - } - if (OB_FAIL(ret)) { - need_commit_ = false; - } - return ret; - } - - int ObSelectIntoOp::inner_rescan() - { - int ret = OB_SUCCESS; - return ret; - } - - int ObSelectIntoOp::inner_close() - { - int ret = OB_SUCCESS; - ObExternalFileWriter *data_writer = NULL; - int64_t estimated_bytes = 0; - if (ObExternalFileFormat::FormatType::ODPS_FORMAT == format_type_) { - if (!GCONF._use_odps_jni_connector) { - ret = OB_NOT_SUPPORTED; - LOG_WARN("odps jni connector is not supported", K(ret)); - } else { - ret = OB_NOT_SUPPORTED; - LOG_WARN("odps jni connector is not supported", K(ret)); - } - } else if (do_partition_) { - for (ObPartitionWriterMap::iterator iter = partition_map_.begin(); - OB_SUCC(ret) && iter != partition_map_.end(); iter++) { - if (OB_ISNULL(data_writer = iter->second)) { - ret = OB_ERR_UNEXPECTED; - LOG_WARN("data writer is unexpected null", K(ret)); - } else if (OB_FAIL(data_writer->close_data_writer())) { - LOG_WARN("failed to close data writer", K(ret)); - } - } - } else if (OB_NOT_NULL(data_writer_) && OB_FAIL(data_writer_->close_data_writer())) { - LOG_WARN("failed to close data writer", K(ret)); - } - return ret; - } - - int ObSelectIntoOp::get_row_str(const int64_t buf_len, - bool is_first_row, - char *buf, - int64_t &pos) - { - int ret = OB_SUCCESS; - const ObObj &field_str = field_str_; - char 
closed_cht = char_enclose_; - //before 4_1 use output - //after 4_1 use select exprs - const ObIArray &select_exprs = (MY_SPEC.select_exprs_.empty()) ? - MY_SPEC.output_ : MY_SPEC.select_exprs_; - if (!is_first_row && line_str_.is_varying_len_char_type()) { // lines terminated by "a" - ret = databuff_printf(buf, buf_len, pos, "%.*s", line_str_.get_varchar().length(), - line_str_.get_varchar().ptr()); - } - - for (int i = 0 ; OB_SUCC(ret) && i < select_exprs.count() ; i++) { - const ObExpr *expr = select_exprs.at(i); - if (0 != closed_cht && (!is_optional_ || ob_is_string_type(expr->datum_meta_.type_))) { - // closed by "a" (for all cell) or optionally by "a" (for string cell) - if (OB_FAIL(databuff_printf(buf, buf_len, pos, "%c", closed_cht))) { - LOG_WARN("print closed character failed", K(ret), K(closed_cht)); - } - } - if (OB_SUCC(ret)) { - ObObj cell; - ObDatum *datum = NULL; - if (OB_FAIL(expr->eval(eval_ctx_, datum))) { - LOG_WARN("expr eval failed", K(ret)); - } else if (OB_FAIL(datum->to_obj(cell, expr->obj_meta_))) { - LOG_WARN("to obj failed", K(ret)); - } else if (OB_FAIL(cell.print_plain_str_literal(buf, buf_len, pos))) { // cell value - LOG_WARN("print sql failed", K(ret), K(cell)); - } else if (0 != closed_cht && (!is_optional_ || ob_is_string_type(expr->datum_meta_.type_))) { - if (OB_FAIL(databuff_printf(buf, buf_len, pos, "%c", closed_cht))) { - LOG_WARN("print closed character failed", K(ret), K(closed_cht)); - } - } - // field terminated by "a" - if (OB_SUCC(ret) && i != select_exprs.count() - 1 && field_str.is_varying_len_char_type()) { - if (OB_FAIL(databuff_printf(buf, buf_len, pos, "%.*s", field_str.get_varchar().length(), field_str.get_varchar().ptr()))) { - LOG_WARN("print field str failed", K(ret), K(field_str)); - } - } - } - } - - return ret; - } - - int ObSelectIntoOp::calc_first_file_path(ObString &path) - { - int ret = OB_SUCCESS; - ObSqlString file_name_with_suffix; - ObString file_extension; - ObSelectIntoOpInput *input = 
static_cast(input_); - ObString input_file_name = file_location_ != IntoFileLocation::SERVER_DISK - ? path.split_on('?').trim() - : path; - if (OB_ISNULL(input)) { - ret = OB_ERR_UNEXPECTED; - LOG_WARN("op input is null", K(ret)); - } else if (input_file_name.length() == 0 || path.length() == 0) { - ret = OB_INVALID_ARGUMENT; - LOG_USER_ERROR(OB_INVALID_ARGUMENT, "invalid outfile path"); - LOG_WARN("invalid outfile path", K(ret)); - } else { - if (input_file_name.ptr()[input_file_name.length() - 1] == '/'){ - OZ(file_name_with_suffix.append_fmt("%.*sdata", input_file_name.length(), input_file_name.ptr())); - } else { - OZ(file_name_with_suffix.append_fmt("%.*s", input_file_name.length(), input_file_name.ptr())); - } - if (MY_SPEC.parallel_ > 1) { - OZ(file_name_with_suffix.append_fmt("_%ld_%ld_%d", input->sqc_id_, input->task_id_, 0)); - } else { - OZ(file_name_with_suffix.append_fmt("_%d", 0)); - } - OZ(external_properties_.get_format_file_extension(format_type_, file_extension)); - if (!file_extension.empty() && file_extension.ptr()[0] != '.') { - OZ(file_name_with_suffix.append(".")); - } - OZ(file_name_with_suffix.append(file_extension)); - if (format_type_ == ObExternalFileFormat::FormatType::CSV_FORMAT) { - OZ(file_name_with_suffix.append(compression_algorithm_to_suffix(external_properties_.csv_format_.compression_algorithm_))); - } - if (file_location_ != IntoFileLocation::SERVER_DISK) { - OZ(file_name_with_suffix.append_fmt("?%.*s", path.length(), path.ptr())); - } - if (OB_SUCC(ret) && OB_FAIL(ob_write_string(ctx_.get_allocator(), file_name_with_suffix.string(), path))) { - LOG_WARN("failed to write string", K(ret)); - } - } - return ret; - } - - int ObSelectIntoOp::calc_next_file_path(ObExternalFileWriter &data_writer) - { - int ret = OB_SUCCESS; - ObSqlString url_with_suffix; - ObString file_path; - data_writer.split_file_id_++; - if (data_writer.split_file_id_ > 0) { - if (MY_SPEC.is_single_ && IntoFileLocation::SERVER_DISK != file_location_) { - 
file_path = (data_writer.split_file_id_ > 1) - ? data_writer.url_.split_on(data_writer.url_.reverse_find('.')) - : data_writer.url_; - if (OB_FAIL(url_with_suffix.assign(file_path))) { - LOG_WARN("failed to assign string", K(ret)); - } else if (OB_FAIL(url_with_suffix.append_fmt(".extend%ld", data_writer.split_file_id_))) { - LOG_WARN("failed to append string", K(ret)); - } - } else if (!MY_SPEC.is_single_) { - file_path = data_writer.url_.split_on(data_writer.url_.reverse_find('_')); - if (OB_FAIL(url_with_suffix.assign(file_path))) { - LOG_WARN("failed to assign string", K(ret)); - } else if (OB_FAIL(url_with_suffix.append_fmt("_%ld", data_writer.split_file_id_))) { - LOG_WARN("failed to append string", K(ret)); - } - } else { - ret = OB_ERR_UNEXPECTED; - LOG_WARN("get unexpected single value", K(ret)); - } - if (!MY_SPEC.is_single_) { - ObString file_extension; - OZ(external_properties_.get_format_file_extension(format_type_, file_extension)); - if (!file_extension.empty() && file_extension.ptr()[0] != '.') { - OZ(url_with_suffix.append(".")); - } - OZ(url_with_suffix.append(file_extension)); - } - if (!MY_SPEC.is_single_ - && format_type_ == ObExternalFileFormat::FormatType::CSV_FORMAT) { - OZ(url_with_suffix.append(compression_algorithm_to_suffix(external_properties_.csv_format_.compression_algorithm_))); - } - if (OB_SUCC(ret) && OB_FAIL(ob_write_string(ctx_.get_allocator(), - url_with_suffix.string(), - data_writer.url_, true))) { - LOG_WARN("failed to write string", K(ret)); - } - } else { - ret = OB_ERR_UNEXPECTED; - LOG_WARN("get unexpected split file id", K(ret)); - } - return ret; - } - // Set the current data_writer's url_ based on the incoming partition and basic_url_, each partition only needs to be calculated once, subsequent changes only need to modify the split id - int ObSelectIntoOp::calc_file_path_with_partition(ObString partition, ObExternalFileWriter &data_writer) - { - int ret = OB_SUCCESS; - ObSqlString url_with_partition; - ObString 
dir_path; - if (OB_FAIL(ob_write_string(ctx_.get_allocator(), basic_url_, data_writer.url_))) { - LOG_WARN("failed to write string", K(ret)); - } else { - dir_path = data_writer.url_.split_on(data_writer.url_.reverse_find('/')); - if (OB_FAIL(url_with_partition.assign(dir_path))) { - LOG_WARN("failed to assign string", K(ret)); - } else if (url_with_partition.length() != 0 && OB_FAIL(url_with_partition.append("/"))) { - LOG_WARN("failed to append string", K(ret)); - } else if (partition.length() != 0 && OB_FAIL(url_with_partition.append_fmt("%.*s/", - partition.length(), - partition.ptr()))) { - LOG_WARN("failed to append string", K(ret)); - } else if (partition.length() == 0 && OB_FAIL(url_with_partition.append("__NULL__/"))) { - LOG_WARN("failed to append string", K(ret)); - } else if (OB_FAIL(url_with_partition.append_fmt("%.*s", - data_writer.url_.length(), - data_writer.url_.ptr()))) { - LOG_WARN("failed to append string", K(ret)); - } else if (OB_FAIL(ob_write_string(ctx_.get_allocator(), - url_with_partition.string(), - data_writer.url_, - true))) { - LOG_WARN("failed to write string", K(ret)); - } - } - return ret; - } - - int ObSelectIntoOp::split_file(ObExternalFileWriter &data_writer) - { - int ret = OB_SUCCESS; - if (ObExternalFileFormat::FormatType::CSV_FORMAT == format_type_) { - ObCsvFileWriter *csv_data_writer = static_cast(&data_writer); - if (OB_ISNULL(csv_data_writer)) { - ret = OB_ERR_UNEXPECTED; - LOG_WARN("get unexpected null data writer", K(ret)); - } else if (!use_shared_buf_ && OB_FAIL(csv_data_writer->flush_buf())) { - LOG_WARN("failed to flush buffer", K(ret)); - } else if (has_lob_ && use_shared_buf_ && OB_FAIL(csv_data_writer->flush_shared_buf(shared_buf_))) { - // To ensure the integrity of each line in the file, when there is a lob, the shared buffer may not contain a complete line - // Therefore the remaining content in the shared buffer also needs to be flushed to the current file, in this case, the max_file_size limit cannot be 
strictly enforced - LOG_WARN("failed to flush shared buffer", K(ret)); - } - } - if (OB_FAIL(ret)) { - } else if (OB_FAIL(data_writer.close_file())) { - LOG_WARN("failed to close file", K(ret)); - } else if (OB_FAIL(calc_next_file_path(data_writer))) { - LOG_WARN("failed to calculate new file path", K(ret)); - } - return ret; - } - - int ObSelectIntoOp::check_csv_file_size(ObCsvFileWriter &data_writer) - { - int ret = OB_SUCCESS; - int64_t curr_bytes = data_writer.get_file_size(); - int64_t curr_bytes_exclude_curr_line = data_writer.get_curr_bytes_exclude_curr_line(); - int64_t curr_line_len = curr_bytes - curr_bytes_exclude_curr_line; - bool has_split = false; - bool has_use_shared_buf = use_shared_buf_; - if (has_compress_ && OB_ISNULL(data_writer.get_compress_stream_writer())) { - ret = OB_ERR_UNEXPECTED; - LOG_WARN("get unexpected null compress stream writer", K(ret)); - } else if (!(has_lob_ && has_use_shared_buf) && curr_bytes_exclude_curr_line == 0) { - } else if (file_need_split(curr_bytes)) { - if (OB_FAIL(split_file(data_writer))) { - LOG_WARN("failed to split file", K(ret)); - } else { - has_split = true; - } - } - if (OB_SUCC(ret)) { - if (has_lob_ && has_use_shared_buf) { - if (!has_compress_) { - data_writer.set_write_bytes(has_split ? 0 : curr_bytes); - } - data_writer.reset_curr_line_len(); - } else { - if (!has_compress_) { - data_writer.set_write_bytes(has_split ? curr_line_len : curr_bytes); - } - } - if (has_compress_ && has_split) { - data_writer.get_compress_stream_writer()->reuse(); - } - data_writer.update_last_line_pos(); - } - return ret; - } - - int ObSelectIntoOp::get_buf(char* &buf, int64_t &buf_len, int64_t &pos, ObCsvFileWriter &data_writer) - { - int ret = OB_SUCCESS; - buf = use_shared_buf_ ? get_shared_buf() : data_writer.get_buf(); - buf_len = use_shared_buf_ ? 
get_shared_buf_len() : data_writer.get_buf_len(); - pos = data_writer.get_curr_pos(); - if (OB_ISNULL(buf) && !use_shared_buf_ && OB_FAIL(use_shared_buf(data_writer, buf, buf_len, pos))) { - LOG_WARN("failed to use shared buffer", K(ret)); - } else if (OB_ISNULL(buf)) { - ret = OB_ERR_UNEXPECTED; - LOG_WARN("buf should not be null", K(ret)); - } - return ret; - } - - int ObSelectIntoOp::use_shared_buf(ObCsvFileWriter &data_writer, - char* &buf, - int64_t &buf_len, - int64_t &pos) - { - int ret = OB_SUCCESS; - int64_t curr_pos = data_writer.get_curr_pos(); - if (!use_shared_buf_ && data_writer.get_last_line_pos() == 0) { - if (OB_NOT_NULL(data_writer.get_buf()) && curr_pos > 0) { - MEMCPY(shared_buf_, data_writer.get_buf(), curr_pos); - } - use_shared_buf_ = true; - buf = shared_buf_; - buf_len = shared_buf_len_; - pos = curr_pos; - } else { - ret = OB_ERR_UNEXPECTED; - LOG_WARN("last line should be flushed before this line copied", K(ret)); - } - return ret; - } - - int ObSelectIntoOp::resize_buf(char* &buf, - int64_t &buf_len, - int64_t &pos, - int64_t curr_pos, - bool is_json) - { - int ret = OB_SUCCESS; - int64_t new_buf_len = buf_len * 2; - char* new_buf = NULL; - if (OB_ISNULL(new_buf = static_cast(ctx_.get_allocator().alloc(new_buf_len)))) { - ret = OB_ALLOCATE_MEMORY_FAILED; - LOG_WARN("failed to allocate buffer", K(ret), K(new_buf_len)); - } else if (!is_json) { - if (curr_pos > 0) { - MEMCPY(new_buf, shared_buf_, curr_pos); - } - shared_buf_ = new_buf; - shared_buf_len_ = new_buf_len; - } else { - json_buf_ = new_buf; - json_buf_len_ = new_buf_len; - } - if (OB_SUCC(ret)) { - buf = new_buf; - buf_len = new_buf_len; - pos = is_json ? 
0 : curr_pos; - } - return ret; - } - - int ObSelectIntoOp::resize_or_flush_shared_buf(ObCsvFileWriter &data_writer, - char* &buf, - int64_t &buf_len, - int64_t &pos) - { - int ret = OB_SUCCESS; - if (!use_shared_buf_) { - ret = OB_ERR_UNEXPECTED; - LOG_WARN("get invalid argument", K(use_shared_buf_), K(ret)); - } else if (has_lob_ && data_writer.get_curr_pos() > 0) { - if (OB_FAIL(data_writer.flush_shared_buf(shared_buf_, true))) { - LOG_WARN("failed to flush shared buffer", K(ret)); - } else { - pos = 0; - } - } else if (OB_FAIL(resize_buf(buf, buf_len, pos, data_writer.get_curr_pos()))) { - LOG_WARN("failed to resize shared buffer", K(ret)); - } - return ret; - } - - int ObSelectIntoOp::check_buf_sufficient(ObCsvFileWriter &data_writer, +* Copyright (c) 2025 OceanBase. +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. 
+*/
+
+#define USING_LOG_PREFIX SQL_ENG
+
+#include
+#include
+
+#include "ob_select_into_op.h"
+#include "sql/engine/cmd/ob_variable_set_executor.h"
+#include "lib/charset/ob_charset_string_helper.h"
+#include "sql/engine/px/ob_px_sqc_handler.h"
+#include "sql/engine/expr/ob_expr_json_func_helper.h"
+#include "lib/udt/ob_collection_type.h"
+#include "share/config/ob_server_config.h"
+
+#ifndef OB_BUILD_EMBED_MODE
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#define ARROW_FAIL(statement) (OB_UNLIKELY(!(statement).ok()))
+
+#endif
+
+namespace oceanbase
+{
+using namespace common;
+namespace sql
+{
+
+OB_SERIALIZE_MEMBER(ObSelectIntoOpInput, task_id_, sqc_id_);
+OB_SERIALIZE_MEMBER((ObSelectIntoSpec, ObOpSpec), into_type_, user_vars_, outfile_name_,
+    field_str_, // FARM COMPAT WHITELIST FOR filed_str_: renamed
+    line_str_, closed_cht_, is_optional_, select_exprs_, is_single_, max_file_size_,
+    escaped_cht_, cs_type_, parallel_, file_partition_expr_, buffer_size_, is_overwrite_,
+    external_properties_, external_partition_, alias_names_);
+
+
+// Opens the operator.
+//
+// Steps, in order:
+//   1. fetch the session and read its select limit into top_limit_cnt_;
+//   2. when the spec carries an external-properties string, parse it and take
+//      format_type_ from the parsed result;
+//   3. initialise the environment that matches format_type_.
+//
+// Returns OB_SUCCESS on success, OB_ERR_UNEXPECTED when no session is available,
+// OB_NOT_SUPPORTED for a format this build cannot write (ODPS, ORC, unknown types,
+// and PARQUET when OB_BUILD_EMBED_MODE is defined), or the error of the failing step.
+int ObSelectIntoOp::inner_open()
+{
+  int ret = OB_SUCCESS;
+  ObSQLSessionInfo *session = NULL;
+  if (OB_ISNULL(session = ctx_.get_my_session())) {
+    ret = OB_ERR_UNEXPECTED;
+    LOG_WARN("get session failed", K(ret));
+  } else {
+    // get_next_row is called from inner_open, so opened_ has to be set first to avoid an infinite loop.
+    opened_ = true;
+    if (OB_FAIL(session->get_sql_select_limit(top_limit_cnt_))) {
+      LOG_WARN("fail tp get sql select limit", K(ret));
+    }
+  }
+  if (OB_SUCC(ret) && !MY_SPEC.external_properties_.str_.empty()) {
+    if (OB_FAIL(external_properties_.load_from_string(MY_SPEC.external_properties_.str_,
+                                                      ctx_.get_allocator()))) {
+      LOG_WARN("failed to load external properties", K(ret));
+    } else {
+      format_type_ = external_properties_.format_type_;
+    }
+  }
+  if (OB_SUCC(ret)) {
+    switch (format_type_)
+    {
+      case ObExternalFileFormat::FormatType::CSV_FORMAT:
+      {
+        if (OB_FAIL(init_csv_env())) {
+          LOG_WARN("failed to init csv env", K(ret));
+        }
+        break;
+      }
+      case ObExternalFileFormat::FormatType::ODPS_FORMAT:
+      {
+        // Neither ODPS connector is available for writing, whatever
+        // GCONF._use_odps_jni_connector says, so a single rejection is enough.
+        ret = OB_NOT_SUPPORTED;
+        LOG_WARN("not support odps format", K(ret));
+        break;
+      }
+      case ObExternalFileFormat::FormatType::PARQUET_FORMAT:
+      {
+#ifndef OB_BUILD_EMBED_MODE
+        if (OB_FAIL(init_parquet_env())) {
+          LOG_WARN("failed to init parquet env", K(ret));
+        }
+#else
+        // The parquet writer is compiled out in embed mode. Reject the statement here
+        // instead of reporting success and failing later in inner_get_next_batch.
+        ret = OB_NOT_SUPPORTED;
+        LOG_WARN("parquet is not supported in embed mode", K(ret));
+#endif // OB_BUILD_EMBED_MODE
+        break;
+      }
+      case ObExternalFileFormat::FormatType::ORC_FORMAT:
+      {
+        ret = OB_NOT_SUPPORTED;
+        LOG_WARN("not support orc format", K(ret));
+        break;
+      }
+      default:
+      {
+        ret = OB_NOT_SUPPORTED;
+        LOG_WARN("not support select into type", K(ret), K(format_type_));
+      }
+    }
+  }
+  return ret;
+}
+
+int ObSelectIntoOp::init_csv_env()
+{
+  int ret = OB_SUCCESS;
+  ObSQLSessionInfo *session = NULL;
+  set_csv_format_options();
+  if (OB_ISNULL(session = ctx_.get_my_session())) {
+    ret = OB_ERR_UNEXPECTED;
+    LOG_WARN("get session failed", K(ret));
+  } else if (OB_FAIL(init_env_common())) {
+    LOG_WARN("failed to init env common", K(ret));
+  } else if (OB_FAIL(prepare_escape_printer())) {
+    LOG_WARN("failed to calc escape info", K(ret));
+  } else {
+    if (external_properties_.csv_format_.compression_algorithm_ != CsvCompressType::NONE) {
+      has_compress_ = true;
+    }
+    // setup binary output format for bit/binary
+    switch
(external_properties_.csv_format_.binary_format_) { + case ObCSVGeneralFormat::ObCSVBinaryFormat::DEFAULT: + print_params_.binary_string_print_hex_ = false; + break; + case ObCSVGeneralFormat::ObCSVBinaryFormat::HEX: + print_params_.binary_string_print_hex_ = true; + break; + case ObCSVGeneralFormat::ObCSVBinaryFormat::BASE64: + print_params_.binary_string_print_base64_ = true; + break; + default: + ret = OB_ERR_UNEXPECTED; + LOG_WARN("failed to set csv binary output format", K(ret)); + } + print_params_.tz_info_ = session->get_timezone_info(); + print_params_.use_memcpy_ = true; + print_params_.cs_type_ = cs_type_; + } + //create buffer + if (OB_SUCC(ret) && T_INTO_OUTFILE == MY_SPEC.into_type_ && OB_FAIL(create_shared_buffer_for_data_writer())) { + LOG_WARN("failed to create buffer for data writer", K(ret)); + } + return ret; +} + +void ObSelectIntoOp::set_csv_format_options() +{ + if (MY_SPEC.external_properties_.str_.empty()) { + field_str_ = MY_SPEC.field_str_; + line_str_ = MY_SPEC.line_str_; + has_enclose_ = MY_SPEC.closed_cht_.get_val_len() > 0; + char_enclose_ = has_enclose_ ? MY_SPEC.closed_cht_.get_char().ptr()[0] : 0; + is_optional_ = MY_SPEC.is_optional_; + has_escape_ = MY_SPEC.escaped_cht_.get_val_len() > 0; + char_escape_ = has_escape_ ? 
MY_SPEC.escaped_cht_.get_char().ptr()[0] : 0; + cs_type_ = MY_SPEC.cs_type_; + } else { + is_optional_ = external_properties_.csv_format_.is_optional_; + cs_type_ = ObCharset::get_default_collation(external_properties_.csv_format_.cs_type_); + field_str_.set_varchar(external_properties_.csv_format_.field_term_str_); + field_str_.set_collation_type(cs_type_); + line_str_.set_varchar(external_properties_.csv_format_.line_term_str_); + line_str_.set_collation_type(cs_type_); + if (external_properties_.csv_format_.field_enclosed_char_ == INT64_MAX) { // null + has_enclose_ = false; + char_enclose_ = 0; + } else { + has_enclose_ = true; + char_enclose_ = external_properties_.csv_format_.field_enclosed_char_; + } + if (external_properties_.csv_format_.field_escaped_char_ == INT64_MAX) { // null + has_escape_ = false; + char_escape_ = 0; + } else { + has_escape_ = true; + char_escape_ = external_properties_.csv_format_.field_escaped_char_; + } + } +} + +int ObSelectIntoOp::init_env_common() +{ + int ret = OB_SUCCESS; + ObPhysicalPlanCtx *phy_plan_ctx = NULL; + bool need_check = false; + file_name_ = MY_SPEC.outfile_name_; + do_partition_ = MY_SPEC.file_partition_expr_ == NULL ? 
false : true; + if (OB_ISNULL(phy_plan_ctx = ctx_.get_physical_plan_ctx())) { + ret = OB_ERR_UNEXPECTED; + LOG_WARN("get phy_plan_ctx failed", K(ret)); + } else if (OB_FAIL(ObSQLUtils::get_param_value(MY_SPEC.outfile_name_, + phy_plan_ctx->get_param_store(), + file_name_, + need_check))) { + LOG_WARN("get param value failed", K(ret)); + } else if (OB_FAIL(calc_url_and_set_access_info())) { + LOG_WARN("failed to calc basic url and set device handle", K(ret)); + } else if (OB_FAIL(check_has_lob_or_json())) { + LOG_WARN("failed to check has lob", K(ret)); + } else if (has_coll_ && MY_SPEC.into_type_ == T_INTO_VARIABLES) { + ret = OB_NOT_SUPPORTED; + LOG_USER_ERROR(OB_NOT_SUPPORTED, "select array/map into variables"); + } else if (do_partition_ + && OB_FAIL(partition_map_.create(128, ObLabel("SelectInto"), ObLabel("SelectInto"), MTL_ID()))) { + LOG_WARN("failed to create hashmap", K(ret)); + } else if (MY_SPEC.select_exprs_.count() != MY_SPEC.alias_names_.strs_.count()) { + ret = OB_ERR_UNEXPECTED; + LOG_WARN("get unexpected column count", K(MY_SPEC.select_exprs_.count()), + K(MY_SPEC.alias_names_.strs_.count()), K(ret)); + } + return ret; +} + +//calc first data_writer.url_ and basic_url_ +int ObSelectIntoOp::calc_url_and_set_access_info() +{ + int ret = OB_SUCCESS; + const ObItemType into_type = MY_SPEC.into_type_; + ObString path = file_name_.get_varchar().trim(); + if (path.prefix_match_ci(OB_S3_PREFIX)) { + ret = OB_NOT_SUPPORTED; + LOG_USER_ERROR(OB_NOT_SUPPORTED, "S3 storage"); + LOG_WARN("S3 storage is not supported", K(ret)); + } else if (path.prefix_match_ci(OB_AZBLOB_PREFIX)) { + file_location_ = IntoFileLocation::REMOTE_AZBLOB; + } else if (path.prefix_match_ci(OB_OSS_PREFIX)) { + ret = OB_NOT_SUPPORTED; + LOG_USER_ERROR(OB_NOT_SUPPORTED, "OSS storage"); + LOG_WARN("OSS storage is not supported", K(ret)); + } else if (path.prefix_match_ci(OB_COS_PREFIX)) { + ret = OB_NOT_SUPPORTED; + LOG_USER_ERROR(OB_NOT_SUPPORTED, "COS storage"); + LOG_WARN("COS storage 
is not supported", K(ret)); + } else if (path.prefix_match_ci(OB_HDFS_PREFIX)) { + ret = OB_NOT_SUPPORTED; + LOG_USER_ERROR(OB_NOT_SUPPORTED, "HDFS storage"); + LOG_WARN("HDFS storage is not supported", K(ret)); + } else { + file_location_ = IntoFileLocation::SERVER_DISK; + } + if (file_location_ == IntoFileLocation::SERVER_DISK && do_partition_) { + ret = OB_NOT_SUPPORTED; + LOG_WARN("not support partition option on server disk", K(ret)); + LOG_USER_ERROR(OB_NOT_SUPPORTED, "partition option on server disk"); + } else if (T_INTO_OUTFILE == into_type && !MY_SPEC.is_single_ && OB_FAIL(calc_first_file_path(path))) { + LOG_WARN("failed to calc first file path", K(ret)); + } else if (file_location_ != IntoFileLocation::SERVER_DISK) { + ObString temp_url = path.split_on('?'); + temp_url.trim(); + ObString storage_info; + if (OB_FAIL(ob_write_string(ctx_.get_allocator(), temp_url, basic_url_, true))) { + LOG_WARN("failed to append string", K(ret)); + } else if (OB_FAIL(ob_write_string(ctx_.get_allocator(), path, storage_info, true))) { + LOG_WARN("failed to append string", K(ret)); + } else if (OB_FAIL(access_info_.set(basic_url_.ptr(), storage_info.ptr()))) { + LOG_WARN("failed to set access info", K(ret), K(path)); + } else if (basic_url_.empty() || !access_info_.is_valid()) { + ret = OB_FILE_NOT_EXIST; + LOG_WARN("file path not exist", K(ret), K(basic_url_), K(access_info_)); + } + } else { // IntoFileLocation::SERVER_DISK + if (OB_FAIL(ob_write_string(ctx_.get_allocator(), path, basic_url_, true))) { + LOG_WARN("failed to write string", K(ret)); + } + } + if (OB_SUCC(ret) && (T_INTO_OUTFILE == into_type || T_INTO_DUMPFILE == into_type) + && IntoFileLocation::SERVER_DISK == file_location_ && OB_FAIL(check_secure_file_path(basic_url_))) { + LOG_WARN("failed to check secure file path", K(ret)); + } + return ret; +} +// csv, odps supports batch and non-batch interfaces; parquet, orc only supports batch interface; non-batch interface will be discontinued later +int 
ObSelectIntoOp::inner_get_next_row() +{ + int ret = 0 == top_limit_cnt_ ? OB_ITER_END : OB_SUCCESS; + int64_t row_count = 0; + const ObItemType into_type = MY_SPEC.into_type_; + ObPhysicalPlanCtx *phy_plan_ctx = NULL; + ObExternalFileWriter *data_writer = NULL; + if (ObExternalFileFormat::FormatType::CSV_FORMAT != format_type_ + && ObExternalFileFormat::FormatType::ODPS_FORMAT != format_type_) { + ret = OB_NOT_SUPPORTED; + LOG_WARN("this type not supported in not batch interface", K(ret), K(format_type_)); + LOG_USER_ERROR(OB_NOT_SUPPORTED, "this upload type"); + } else if (OB_ISNULL(phy_plan_ctx = ctx_.get_physical_plan_ctx())) { + ret = OB_ERR_UNEXPECTED; + LOG_WARN("get phy_plan_ctx failed", K(ret)); + } + //when do_partition is false, create the only data_writer here + if (OB_SUCC(ret) && ObExternalFileFormat::FormatType::CSV_FORMAT == format_type_ + && T_INTO_VARIABLES != into_type && !do_partition_ + && OB_FAIL(create_the_only_data_writer(data_writer))) { + LOG_WARN("failed to create the only data writer", K(ret)); + } + while (OB_SUCC(ret) && row_count < top_limit_cnt_) { + clear_evaluated_flag(); + if (OB_FAIL(child_->get_next_row())) { + if (OB_LIKELY(OB_ITER_END == ret)) { + } else { + LOG_WARN("get next row failed", K(ret)); + } + } else { + ++row_count; + if (ObExternalFileFormat::FormatType::ODPS_FORMAT == format_type_) { + if (is_odps_cpp_table_ == is_odps_java_table_) { + ret = OB_ERR_UNEXPECTED; + LOG_WARN("invalid table mode for odps table", K(ret), + K(is_odps_cpp_table_), K(is_odps_java_table_)); + } else if (is_odps_cpp_table_) { + ret = OB_NOT_SUPPORTED; + LOG_USER_ERROR(OB_NOT_SUPPORTED, "external odps cpp table"); + LOG_WARN("use supported version", K(ret)); + } else { + ret = OB_NOT_SUPPORTED; + LOG_USER_ERROR(OB_NOT_SUPPORTED, "external odps table"); + LOG_WARN("not support jni odps single write", K(ret)); + } + } else if (T_INTO_VARIABLES == into_type) { + if (OB_FAIL(into_varlist())) { + LOG_WARN("into varlist failed", K(ret)); + } + } 
else if (T_INTO_OUTFILE == into_type) { + if (OB_FAIL(into_outfile(data_writer))) { + LOG_WARN("into outfile failed", K(ret)); + } + } else { + if (OB_FAIL(into_dumpfile(data_writer))) { + LOG_WARN("into dumpfile failed", K(ret)); + } + } + } + if (OB_SUCC(ret) || OB_ITER_END == ret) { // if into user variables or into dumpfile, must be one row + if (ObExternalFileFormat::FormatType::CSV_FORMAT == format_type_ + && (T_INTO_VARIABLES == into_type || T_INTO_DUMPFILE == into_type) && row_count > 1) { + ret = OB_ERR_TOO_MANY_ROWS; + LOG_WARN("more than one row for into variables or into dumpfile", K(ret), K(row_count)); + } + } + } //end while + if (OB_ITER_END == ret || OB_SUCC(ret)) { // set affected rows + phy_plan_ctx->set_affected_rows(row_count); + } + if (OB_FAIL(ret) && OB_ITER_END != ret) { + need_commit_ = false; + } + return ret; +} + +int ObSelectIntoOp::inner_get_next_batch(const int64_t max_row_cnt) +{ + int ret = OB_SUCCESS; + const ObBatchRows *child_brs = NULL; + int64_t batch_size = min(max_row_cnt, MY_SPEC.max_batch_size_); + int64_t row_count = 0; + const ObItemType into_type = MY_SPEC.into_type_; + ObPhysicalPlanCtx *phy_plan_ctx = NULL; + ObExternalFileWriter *data_writer = NULL; + bool stop_loop = false; + bool is_iter_end = false; + if (OB_ISNULL(phy_plan_ctx = ctx_.get_physical_plan_ctx())) { + ret = OB_ERR_UNEXPECTED; + LOG_WARN("get phy_plan_ctx failed", K(ret)); + } + //when do_partition is false, create the only data_writer here + if (OB_SUCC(ret) && T_INTO_VARIABLES != into_type && !do_partition_ + && (ObExternalFileFormat::FormatType::CSV_FORMAT == format_type_ + || ObExternalFileFormat::FormatType::PARQUET_FORMAT == format_type_ + )) { + if (OB_FAIL(create_the_only_data_writer(data_writer))) { + LOG_WARN("failed to create the only data writer", K(ret)); + } else if (OB_ISNULL(data_writer)) { + ret = OB_ERR_UNEXPECTED; + LOG_WARN("get unexpected null", K(ret)); + } + } + + if (0 == top_limit_cnt_) { + brs_.size_ = 0; + brs_.end_ = true; + 
stop_loop = true; + } + while (OB_SUCC(ret) && !stop_loop) { + clear_evaluated_flag(); + int64_t rowkey_batch_size = min(batch_size, top_limit_cnt_ - row_count); + if (OB_FAIL(child_->get_next_batch(rowkey_batch_size, child_brs))) { + LOG_WARN("get next batch failed", K(ret)); + } else { + brs_.size_ = child_brs->size_; + brs_.end_ = child_brs->end_; + is_iter_end = brs_.end_ && 0 == brs_.size_; + if (brs_.size_ > 0) { + brs_.skip_->deep_copy(*(child_brs->skip_), brs_.size_); + row_count += brs_.size_ - brs_.skip_->accumulate_bit_cnt(brs_.size_); + if (ObExternalFileFormat::FormatType::ODPS_FORMAT == format_type_) { + if (!GCONF._use_odps_jni_connector) { + ret = OB_NOT_SUPPORTED; + LOG_WARN("odps cpp connector is not supported", K(ret)); + } else { + ret = OB_NOT_SUPPORTED; + LOG_WARN("odps jni connector is not supported", K(ret)); + } + } else if (T_INTO_OUTFILE == into_type) { + if (ObExternalFileFormat::FormatType::CSV_FORMAT == format_type_) { + if (OB_FAIL(into_outfile_batch_csv(brs_, data_writer))) { + LOG_WARN("csv into outfile batch failed", K(ret)); + } + } else if (ObExternalFileFormat::FormatType::PARQUET_FORMAT == format_type_) { +#ifndef OB_BUILD_EMBED_MODE + if (OB_FAIL(into_outfile_batch_parquet(brs_, data_writer))) { + LOG_WARN("parquet into outfile batch failed", K(ret)); + } +#else + ret = OB_NOT_SUPPORTED; + LOG_WARN("parquet is not supported in embed mode", K(ret)); +#endif // OB_BUILD_EMBED_MODE + } else if (ObExternalFileFormat::FormatType::ORC_FORMAT == format_type_) { + ret = OB_NOT_SUPPORTED; + } else { + ret = OB_NOT_SUPPORTED; + LOG_WARN("not support to write into outfile format.", K(ret), K(format_type_)); + } + } else { + ObEvalCtx::BatchInfoScopeGuard guard(eval_ctx_); + guard.set_batch_size(brs_.size_); + for (int64_t i = 0; OB_SUCC(ret) && i < brs_.size_; i++) { + if (brs_.skip_->contain(i)) { + continue; + } + guard.set_batch_idx(i); + if (T_INTO_VARIABLES == into_type) { + if (OB_FAIL(into_varlist())) { + LOG_WARN("into varlist 
failed", K(ret));
+              }
+            } else {
+              if (OB_FAIL(into_dumpfile(data_writer))) {
+                LOG_WARN("into dumpfile failed", K(ret));
+              }
+            }
+          }
+        }
+      }
+    }
+    if (is_iter_end || row_count >= top_limit_cnt_) {
+      stop_loop = true;
+    }
+    if (OB_SUCC(ret) || is_iter_end) { // if into user variables or into dumpfile, must be one row
+      if ((T_INTO_VARIABLES == into_type || T_INTO_DUMPFILE == into_type) && row_count > 1) {
+        ret = OB_ERR_TOO_MANY_ROWS;
+        LOG_WARN("more than one row for into variables or into dumpfile", K(ret), K(row_count));
+      }
+    }
+  } //end while
+  if (OB_SUCC(ret)) { // set affected rows
+    phy_plan_ctx->set_affected_rows(row_count);
+  }
+  if (OB_FAIL(ret)) {
+    need_commit_ = false;
+  }
+  return ret;
+}
+
+// Rescan hook. Nothing is reset here; always returns OB_SUCCESS.
+int ObSelectIntoOp::inner_rescan()
+{
+  return OB_SUCCESS;
+}
+
+// Closes the data writer(s) owned by this operator.
+//
+//   - ODPS format: always OB_NOT_SUPPORTED; the log line names the connector that
+//     GCONF._use_odps_jni_connector selects (cpp when the flag is off, jni when on),
+//     matching the wording used in inner_get_next_batch.
+//   - Partitioned output: closes every writer stored in partition_map_.
+//   - Otherwise: closes data_writer_ when it is set.
+//
+// Returns OB_SUCCESS, OB_NOT_SUPPORTED, OB_ERR_UNEXPECTED for a null map entry, or the
+// error returned by close_data_writer().
+int ObSelectIntoOp::inner_close()
+{
+  int ret = OB_SUCCESS;
+  ObExternalFileWriter *data_writer = NULL;
+  if (ObExternalFileFormat::FormatType::ODPS_FORMAT == format_type_) {
+    ret = OB_NOT_SUPPORTED;
+    if (!GCONF._use_odps_jni_connector) {
+      LOG_WARN("odps cpp connector is not supported", K(ret));
+    } else {
+      LOG_WARN("odps jni connector is not supported", K(ret));
+    }
+  } else if (do_partition_) {
+    // NOTE(review): the loop stops at the first failure, so writers after the failing
+    // one are not closed here - confirm they are released elsewhere.
+    for (ObPartitionWriterMap::iterator iter = partition_map_.begin();
+         OB_SUCC(ret) && iter != partition_map_.end(); iter++) {
+      if (OB_ISNULL(data_writer = iter->second)) {
+        ret = OB_ERR_UNEXPECTED;
+        LOG_WARN("data writer is unexpected null", K(ret));
+      } else if (OB_FAIL(data_writer->close_data_writer())) {
+        LOG_WARN("failed to close data writer", K(ret));
+      }
+    }
+  } else if (OB_NOT_NULL(data_writer_) && OB_FAIL(data_writer_->close_data_writer())) {
+    LOG_WARN("failed to close data writer", K(ret));
+  }
+  return ret;
+}
+
+int ObSelectIntoOp::get_row_str(const int64_t buf_len,
+                                bool is_first_row,
+                                char *buf,
+                                int64_t &pos)
+{
+  int ret = OB_SUCCESS;
+  const ObObj &field_str = field_str_;
+  char closed_cht =
char_enclose_; + //before 4_1 use output + //after 4_1 use select exprs + const ObIArray &select_exprs = (MY_SPEC.select_exprs_.empty()) ? + MY_SPEC.output_ : MY_SPEC.select_exprs_; + if (!is_first_row && line_str_.is_varying_len_char_type()) { // lines terminated by "a" + ret = databuff_printf(buf, buf_len, pos, "%.*s", line_str_.get_varchar().length(), + line_str_.get_varchar().ptr()); + } + + for (int i = 0 ; OB_SUCC(ret) && i < select_exprs.count() ; i++) { + const ObExpr *expr = select_exprs.at(i); + if (0 != closed_cht && (!is_optional_ || ob_is_string_type(expr->datum_meta_.type_))) { + // closed by "a" (for all cell) or optionally by "a" (for string cell) + if (OB_FAIL(databuff_printf(buf, buf_len, pos, "%c", closed_cht))) { + LOG_WARN("print closed character failed", K(ret), K(closed_cht)); + } + } + if (OB_SUCC(ret)) { + ObObj cell; + ObDatum *datum = NULL; + if (OB_FAIL(expr->eval(eval_ctx_, datum))) { + LOG_WARN("expr eval failed", K(ret)); + } else if (OB_FAIL(datum->to_obj(cell, expr->obj_meta_))) { + LOG_WARN("to obj failed", K(ret)); + } else if (OB_FAIL(cell.print_plain_str_literal(buf, buf_len, pos))) { // cell value + LOG_WARN("print sql failed", K(ret), K(cell)); + } else if (0 != closed_cht && (!is_optional_ || ob_is_string_type(expr->datum_meta_.type_))) { + if (OB_FAIL(databuff_printf(buf, buf_len, pos, "%c", closed_cht))) { + LOG_WARN("print closed character failed", K(ret), K(closed_cht)); + } + } + // field terminated by "a" + if (OB_SUCC(ret) && i != select_exprs.count() - 1 && field_str.is_varying_len_char_type()) { + if (OB_FAIL(databuff_printf(buf, buf_len, pos, "%.*s", field_str.get_varchar().length(), field_str.get_varchar().ptr()))) { + LOG_WARN("print field str failed", K(ret), K(field_str)); + } + } + } + } + + return ret; +} + +int ObSelectIntoOp::calc_first_file_path(ObString &path) +{ + int ret = OB_SUCCESS; + ObSqlString file_name_with_suffix; + ObString file_extension; + ObSelectIntoOpInput *input = static_cast(input_); + 
ObString input_file_name = file_location_ != IntoFileLocation::SERVER_DISK + ? path.split_on('?').trim() + : path; + if (OB_ISNULL(input)) { + ret = OB_ERR_UNEXPECTED; + LOG_WARN("op input is null", K(ret)); + } else if (input_file_name.length() == 0 || path.length() == 0) { + ret = OB_INVALID_ARGUMENT; + LOG_USER_ERROR(OB_INVALID_ARGUMENT, "invalid outfile path"); + LOG_WARN("invalid outfile path", K(ret)); + } else { + if (input_file_name.ptr()[input_file_name.length() - 1] == '/'){ + OZ(file_name_with_suffix.append_fmt("%.*sdata", input_file_name.length(), input_file_name.ptr())); + } else { + OZ(file_name_with_suffix.append_fmt("%.*s", input_file_name.length(), input_file_name.ptr())); + } + if (MY_SPEC.parallel_ > 1) { + OZ(file_name_with_suffix.append_fmt("_%ld_%ld_%d", input->sqc_id_, input->task_id_, 0)); + } else { + OZ(file_name_with_suffix.append_fmt("_%d", 0)); + } + OZ(external_properties_.get_format_file_extension(format_type_, file_extension)); + if (!file_extension.empty() && file_extension.ptr()[0] != '.') { + OZ(file_name_with_suffix.append(".")); + } + OZ(file_name_with_suffix.append(file_extension)); + if (format_type_ == ObExternalFileFormat::FormatType::CSV_FORMAT) { + OZ(file_name_with_suffix.append(compression_algorithm_to_suffix(external_properties_.csv_format_.compression_algorithm_))); + } + if (file_location_ != IntoFileLocation::SERVER_DISK) { + OZ(file_name_with_suffix.append_fmt("?%.*s", path.length(), path.ptr())); + } + if (OB_SUCC(ret) && OB_FAIL(ob_write_string(ctx_.get_allocator(), file_name_with_suffix.string(), path))) { + LOG_WARN("failed to write string", K(ret)); + } + } + return ret; +} + +int ObSelectIntoOp::calc_next_file_path(ObExternalFileWriter &data_writer) +{ + int ret = OB_SUCCESS; + ObSqlString url_with_suffix; + ObString file_path; + data_writer.split_file_id_++; + if (data_writer.split_file_id_ > 0) { + if (MY_SPEC.is_single_ && IntoFileLocation::SERVER_DISK != file_location_) { + file_path = 
(data_writer.split_file_id_ > 1) + ? data_writer.url_.split_on(data_writer.url_.reverse_find('.')) + : data_writer.url_; + if (OB_FAIL(url_with_suffix.assign(file_path))) { + LOG_WARN("failed to assign string", K(ret)); + } else if (OB_FAIL(url_with_suffix.append_fmt(".extend%ld", data_writer.split_file_id_))) { + LOG_WARN("failed to append string", K(ret)); + } + } else if (!MY_SPEC.is_single_) { + file_path = data_writer.url_.split_on(data_writer.url_.reverse_find('_')); + if (OB_FAIL(url_with_suffix.assign(file_path))) { + LOG_WARN("failed to assign string", K(ret)); + } else if (OB_FAIL(url_with_suffix.append_fmt("_%ld", data_writer.split_file_id_))) { + LOG_WARN("failed to append string", K(ret)); + } + } else { + ret = OB_ERR_UNEXPECTED; + LOG_WARN("get unexpected single value", K(ret)); + } + if (!MY_SPEC.is_single_) { + ObString file_extension; + OZ(external_properties_.get_format_file_extension(format_type_, file_extension)); + if (!file_extension.empty() && file_extension.ptr()[0] != '.') { + OZ(url_with_suffix.append(".")); + } + OZ(url_with_suffix.append(file_extension)); + } + if (!MY_SPEC.is_single_ + && format_type_ == ObExternalFileFormat::FormatType::CSV_FORMAT) { + OZ(url_with_suffix.append(compression_algorithm_to_suffix(external_properties_.csv_format_.compression_algorithm_))); + } + if (OB_SUCC(ret) && OB_FAIL(ob_write_string(ctx_.get_allocator(), + url_with_suffix.string(), + data_writer.url_, true))) { + LOG_WARN("failed to write string", K(ret)); + } + } else { + ret = OB_ERR_UNEXPECTED; + LOG_WARN("get unexpected split file id", K(ret)); + } + return ret; +} +// Set the current data_writer's url_ based on the incoming partition and basic_url_, each partition only needs to be calculated once, subsequent changes only need to modify the split id +int ObSelectIntoOp::calc_file_path_with_partition(ObString partition, ObExternalFileWriter &data_writer) +{ + int ret = OB_SUCCESS; + ObSqlString url_with_partition; + ObString dir_path; + if 
(OB_FAIL(ob_write_string(ctx_.get_allocator(), basic_url_, data_writer.url_))) { + LOG_WARN("failed to write string", K(ret)); + } else { + dir_path = data_writer.url_.split_on(data_writer.url_.reverse_find('/')); + if (OB_FAIL(url_with_partition.assign(dir_path))) { + LOG_WARN("failed to assign string", K(ret)); + } else if (url_with_partition.length() != 0 && OB_FAIL(url_with_partition.append("/"))) { + LOG_WARN("failed to append string", K(ret)); + } else if (partition.length() != 0 && OB_FAIL(url_with_partition.append_fmt("%.*s/", + partition.length(), + partition.ptr()))) { + LOG_WARN("failed to append string", K(ret)); + } else if (partition.length() == 0 && OB_FAIL(url_with_partition.append("__NULL__/"))) { + LOG_WARN("failed to append string", K(ret)); + } else if (OB_FAIL(url_with_partition.append_fmt("%.*s", + data_writer.url_.length(), + data_writer.url_.ptr()))) { + LOG_WARN("failed to append string", K(ret)); + } else if (OB_FAIL(ob_write_string(ctx_.get_allocator(), + url_with_partition.string(), + data_writer.url_, + true))) { + LOG_WARN("failed to write string", K(ret)); + } + } + return ret; +} + +int ObSelectIntoOp::split_file(ObExternalFileWriter &data_writer) +{ + int ret = OB_SUCCESS; + if (ObExternalFileFormat::FormatType::CSV_FORMAT == format_type_) { + ObCsvFileWriter *csv_data_writer = static_cast(&data_writer); + if (OB_ISNULL(csv_data_writer)) { + ret = OB_ERR_UNEXPECTED; + LOG_WARN("get unexpected null data writer", K(ret)); + } else if (!use_shared_buf_ && OB_FAIL(csv_data_writer->flush_buf())) { + LOG_WARN("failed to flush buffer", K(ret)); + } else if (has_lob_ && use_shared_buf_ && OB_FAIL(csv_data_writer->flush_shared_buf(shared_buf_))) { + // To ensure the integrity of each line in the file, when there is a lob, the shared buffer may not contain a complete line + // Therefore the remaining content in the shared buffer also needs to be flushed to the current file, in this case, the max_file_size limit cannot be strictly enforced + 
LOG_WARN("failed to flush shared buffer", K(ret));
+    }
+  }
+  if (OB_FAIL(ret)) {
+  } else if (OB_FAIL(data_writer.close_file())) {
+    LOG_WARN("failed to close file", K(ret));
+  } else if (OB_FAIL(calc_next_file_path(data_writer))) {
+    LOG_WARN("failed to calculate new file path", K(ret));
+  }
+  return ret;
+}
+
+// Called after a line has been produced: splits the output file when
+// file_need_split() says the current size requires it, then updates the writer's
+// byte counters and last-line position.
+//
+// The split check is skipped when nothing precedes the current line in the file,
+// unless a lob line is being streamed through the shared buffer.
+// After a split the written-bytes counter restarts at the current line's length,
+// or at 0 in the lob/shared-buffer case; without a split it is set to the current
+// file size. The counter is left alone when compression is on; instead the
+// compress stream writer is reused after a split.
+//
+// Returns OB_SUCCESS, OB_ERR_UNEXPECTED when compression is on but the writer has
+// no compress stream writer, or the error from split_file().
+int ObSelectIntoOp::check_csv_file_size(ObCsvFileWriter &data_writer)
+{
+  int ret = OB_SUCCESS;
+  const int64_t total_bytes = data_writer.get_file_size();
+  const int64_t bytes_before_line = data_writer.get_curr_bytes_exclude_curr_line();
+  const int64_t line_bytes = total_bytes - bytes_before_line;
+  // use_shared_buf_ is captured before split_file() runs; the value seen on entry is
+  // the one used for the bookkeeping below.
+  const bool shared_buf_on_entry = use_shared_buf_;
+  bool did_split = false;
+  if (has_compress_ && OB_ISNULL(data_writer.get_compress_stream_writer())) {
+    ret = OB_ERR_UNEXPECTED;
+    LOG_WARN("get unexpected null compress stream writer", K(ret));
+  } else if ((has_lob_ && shared_buf_on_entry) || 0 != bytes_before_line) {
+    if (file_need_split(total_bytes)) {
+      if (OB_FAIL(split_file(data_writer))) {
+        LOG_WARN("failed to split file", K(ret));
+      } else {
+        did_split = true;
+      }
+    }
+  }
+  if (OB_SUCC(ret)) {
+    const bool lob_via_shared_buf = has_lob_ && shared_buf_on_entry;
+    if (!has_compress_) {
+      int64_t written_bytes = total_bytes;
+      if (did_split) {
+        written_bytes = lob_via_shared_buf ? 0 : line_bytes;
+      }
+      data_writer.set_write_bytes(written_bytes);
+    }
+    if (lob_via_shared_buf) {
+      data_writer.reset_curr_line_len();
+    }
+    if (has_compress_ && did_split) {
+      data_writer.get_compress_stream_writer()->reuse();
+    }
+    data_writer.update_last_line_pos();
+  }
+  return ret;
+}
+
+int ObSelectIntoOp::get_buf(char* &buf, int64_t &buf_len, int64_t &pos, ObCsvFileWriter &data_writer)
+{
+  int ret = OB_SUCCESS;
+  buf = use_shared_buf_ ? get_shared_buf() : data_writer.get_buf();
+  buf_len = use_shared_buf_ ?
get_shared_buf_len() : data_writer.get_buf_len();
+  pos = data_writer.get_curr_pos();
+  if (OB_ISNULL(buf) && !use_shared_buf_ && OB_FAIL(use_shared_buf(data_writer, buf, buf_len, pos))) {
+    LOG_WARN("failed to use shared buffer", K(ret));
+  } else if (OB_ISNULL(buf)) {
+    ret = OB_ERR_UNEXPECTED;
+    LOG_WARN("buf should not be null", K(ret));
+  }
+  return ret;
+}
+
+// Switches output for the current line from the writer's own buffer to the
+// operator-level shared buffer.
+//
+// Only allowed while the shared buffer is not yet in use and the writer's last-line
+// position is 0. Bytes already pending in the writer's buffer ([0, curr_pos)) are
+// copied to the start of shared_buf_ first.
+//
+// @param data_writer  writer whose pending bytes are carried over
+// @param buf          [out] set to shared_buf_
+// @param buf_len      [out] set to shared_buf_len_
+// @param pos          [out] set to the writer's current position
+// @return OB_SUCCESS, or OB_ERR_UNEXPECTED when the switch is not allowed or the
+//         shared buffer cannot take the pending bytes (null, or smaller than curr_pos).
+int ObSelectIntoOp::use_shared_buf(ObCsvFileWriter &data_writer,
+                                   char* &buf,
+                                   int64_t &buf_len,
+                                   int64_t &pos)
+{
+  int ret = OB_SUCCESS;
+  const int64_t curr_pos = data_writer.get_curr_pos();
+  if (use_shared_buf_ || data_writer.get_last_line_pos() != 0) {
+    ret = OB_ERR_UNEXPECTED;
+    LOG_WARN("last line should be flushed before this line copied", K(ret));
+  } else {
+    const bool need_copy = OB_NOT_NULL(data_writer.get_buf()) && curr_pos > 0;
+    if (need_copy && (OB_ISNULL(shared_buf_) || curr_pos > shared_buf_len_)) {
+      // Copying would write through a null pointer or past the end of shared_buf_.
+      ret = OB_ERR_UNEXPECTED;
+      LOG_WARN("shared buffer can not hold pending data", K(ret), K(curr_pos), K(shared_buf_len_));
+    } else {
+      if (need_copy) {
+        MEMCPY(shared_buf_, data_writer.get_buf(), curr_pos);
+      }
+      use_shared_buf_ = true;
+      buf = shared_buf_;
+      buf_len = shared_buf_len_;
+      pos = curr_pos;
+    }
+  }
+  return ret;
+}
+
+int ObSelectIntoOp::resize_buf(char* &buf,
+                               int64_t &buf_len,
+                               int64_t &pos,
+                               int64_t curr_pos,
+                               bool is_json)
+{
+  int ret = OB_SUCCESS;
+  int64_t new_buf_len = buf_len * 2;
+  char* new_buf = NULL;
+  if (OB_ISNULL(new_buf = static_cast(ctx_.get_allocator().alloc(new_buf_len)))) {
+    ret = OB_ALLOCATE_MEMORY_FAILED;
+    LOG_WARN("failed to allocate buffer", K(ret), K(new_buf_len));
+  } else if (!is_json) {
+    if (curr_pos > 0) {
+      MEMCPY(new_buf, shared_buf_, curr_pos);
+    }
+    shared_buf_ = new_buf;
+    shared_buf_len_ = new_buf_len;
+  } else {
+    json_buf_ = new_buf;
+    json_buf_len_ = new_buf_len;
+  }
+  if (OB_SUCC(ret)) {
+    buf = new_buf;
+    buf_len = new_buf_len;
+    pos = is_json ?
0 : curr_pos;
+  }
+  return ret;
+}
+
+// Makes room in the shared buffer after a print call ran out of space.
+//
+// With lob columns and a non-empty writer position, the shared buffer is flushed
+// through the writer and pos restarts at 0; otherwise the buffer is grown through
+// resize_buf(). Calling this while the shared buffer is not in use is an error.
+//
+// @return OB_SUCCESS, OB_ERR_UNEXPECTED when the shared buffer is not in use, or the
+//         error from flush_shared_buf() / resize_buf().
+int ObSelectIntoOp::resize_or_flush_shared_buf(ObCsvFileWriter &data_writer,
+                                               char* &buf,
+                                               int64_t &buf_len,
+                                               int64_t &pos)
+{
+  int ret = OB_SUCCESS;
+  if (!use_shared_buf_) {
+    ret = OB_ERR_UNEXPECTED;
+    LOG_WARN("get invalid argument", K(use_shared_buf_), K(ret));
+  } else {
+    const bool flush_instead_of_grow = has_lob_ && data_writer.get_curr_pos() > 0;
+    if (flush_instead_of_grow) {
+      if (OB_FAIL(data_writer.flush_shared_buf(shared_buf_, true))) {
+        LOG_WARN("failed to flush shared buffer", K(ret));
+      } else {
+        pos = 0;
+      }
+    } else if (OB_FAIL(resize_buf(buf, buf_len, pos, data_writer.get_curr_pos()))) {
+      LOG_WARN("failed to resize shared buffer", K(ret));
+    }
+  }
+  return ret;
+}
+
+// Moves the writer onto the shared buffer when its current buffer is smaller than
+// 1.1 times the length of the string about to be printed. The writer's buffer is
+// flushed first, then use_shared_buf() redirects buf / buf_len / pos.
+//
+// @param str_len  length of the string that is about to be written
+// @return OB_SUCCESS, or the error from flush_buf() / use_shared_buf().
+int ObSelectIntoOp::check_buf_sufficient(ObCsvFileWriter &data_writer,
+                                         char* &buf,
+                                         int64_t &buf_len,
+                                         int64_t &pos,
+                                         int64_t str_len)
+{
+  int ret = OB_SUCCESS;
+  const bool buf_too_small = buf_len < str_len * 1.1;
+  if (!buf_too_small) {
+    // current buffer is large enough, nothing to do
+  } else if (OB_FAIL(data_writer.flush_buf())) {
+    LOG_WARN("failed to flush buffer", K(ret));
+  } else if (OB_FAIL(use_shared_buf(data_writer, buf, buf_len, pos))) {
+    LOG_WARN("failed to use shared buffer", K(ret));
+  }
+  return ret;
+}
+
+// Writes one cell to the csv writer, picking the escaping or the plain printer.
+//
+// Escaping applies only to string, json and collection values, and only when
+// need_escape is still set. Binary-collation values printed as hex or base64 never
+// need escaping, so need_escape is cleared for them.
+//
+// @return OB_SUCCESS, or the error from the selected printer.
+int ObSelectIntoOp::write_obj_to_file(const ObObj &obj, ObCsvFileWriter &data_writer, bool need_escape)
+{
+  int ret = OB_SUCCESS;
+  const bool binary_printed_encoded = CS_TYPE_BINARY == obj.get_collation_type()
+      && (print_params_.binary_string_print_hex_ || print_params_.binary_string_print_base64_);
+  if (binary_printed_encoded) {
+    need_escape = false;
+  }
+
+  const bool escapable_type = obj.is_string_type() || obj.is_json() || obj.is_collection_sql_type();
+  if (escapable_type && need_escape) {
+    if (OB_FAIL(print_str_or_json_with_escape(obj, data_writer))) {
+      LOG_WARN("failed to print str or json with escape", K(ret));
+    }
+  } else if (OB_FAIL(print_normal_obj_without_escape(obj, data_writer))) {
+    LOG_WARN("failed to print normal obj without escape", K(ret));
+  }
+  return ret;
+}
+
+int ObSelectIntoOp::print_str_or_json_with_escape(const ObObj &obj, ObCsvFileWriter &data_writer)
+{
+  int ret =
OB_SUCCESS; + char* buf = NULL; + int64_t buf_len = 0; + int64_t pos = 0; + ObCharsetType src_type = ObCharset::charset_type_by_coll(obj.get_collation_type()); + ObCharsetType dst_type = ObCharset::charset_type_by_coll(cs_type_); + escape_printer_.do_encode_ = !(src_type == CHARSET_BINARY || src_type == dst_type + || src_type == CHARSET_INVALID); + escape_printer_.need_enclose_ = has_enclose_ && !obj.is_null(); + escape_printer_.do_escape_ = true; + escape_printer_.print_hex_ = obj.get_collation_type() == CS_TYPE_BINARY + && print_params_.binary_string_print_hex_; + ObString str_to_escape; + ObEvalCtx::TempAllocGuard tmp_alloc_g(eval_ctx_); + common::ObArenaAllocator &temp_allocator = tmp_alloc_g.get_allocator(); + if (OB_FAIL(get_buf(escape_printer_.buf_, escape_printer_.buf_len_, escape_printer_.pos_, data_writer))) { + LOG_WARN("failed to get buffer", K(ret)); + } else if (obj.is_json() || obj.is_collection_sql_type()) { + ObObj inrow_obj = obj; + if (obj.is_lob_storage() + && OB_FAIL(ObTextStringIter::convert_outrow_lob_to_inrow_templob(obj, inrow_obj, NULL, &temp_allocator))) { + LOG_WARN("failed to convert outrow lobs", K(ret), K(obj)); + } else if (obj.is_collection_sql_type()) { + ObSubSchemaValue sub_meta; + if (OB_FAIL((get_exec_ctx().get_sqludt_meta_by_subschema_id(obj.get_meta().get_subschema_id(), sub_meta)))) { + LOG_WARN("failed to get collection subschema", K(ret), K(obj.get_meta().get_subschema_id())); + } else { + print_params_.coll_meta_ = reinterpret_cast(sub_meta.value_); + } + } + if (OB_FAIL(ret)) { + } else if (OB_FAIL(print_json_to_json_buf(inrow_obj, buf, buf_len, pos, data_writer))) { + LOG_WARN("failed to print normal obj without escape", K(ret)); + } else { + str_to_escape.assign_ptr(buf, pos); + escape_printer_.do_encode_ = false; + } + } else { + str_to_escape = obj.get_varchar(); + } + if (OB_SUCC(ret) && !use_shared_buf_ && OB_FAIL(check_buf_sufficient(data_writer, + escape_printer_.buf_, + escape_printer_.buf_len_, + 
escape_printer_.pos_, + str_to_escape.length()))) { + LOG_WARN("failed to check if buf is sufficient", K(ret)); + } + if (OB_SUCC(ret) && !use_shared_buf_) { + if (OB_FAIL(ObFastStringScanner::foreach_char(str_to_escape, + src_type, + escape_printer_, + escape_printer_.do_encode_, + escape_printer_.ignore_convert_failed_))) { + if (OB_SIZE_OVERFLOW != ret) { + LOG_WARN("failed to print plain str", K(ret), K(src_type), K(escape_printer_.do_encode_)); + } else if (OB_FAIL(data_writer.flush_buf())) { + LOG_WARN("failed to flush buffer", K(ret)); + } else if (OB_FALSE_IT(escape_printer_.pos_ = data_writer.get_curr_pos())) { + } else if (OB_FAIL(ObFastStringScanner::foreach_char(str_to_escape, + src_type, + escape_printer_, + escape_printer_.do_encode_, + escape_printer_.ignore_convert_failed_))) { + if (OB_SIZE_OVERFLOW != ret) { + LOG_WARN("failed to print plain str", K(ret), K(src_type), K(escape_printer_.do_encode_)); + } else if (OB_FAIL(use_shared_buf(data_writer, + escape_printer_.buf_, + escape_printer_.buf_len_, + escape_printer_.pos_))) { + LOG_WARN("failed to use shared buffer", K(ret)); + } + } + } + } + if (OB_SUCC(ret) && use_shared_buf_) { + do { + if (OB_FAIL(ObFastStringScanner::foreach_char(str_to_escape, + src_type, + escape_printer_, + escape_printer_.do_encode_, + escape_printer_.ignore_convert_failed_))) { + LOG_WARN("failed to print plain str", K(ret), K(src_type), K(escape_printer_.do_encode_)); + } + } while (OB_SIZE_OVERFLOW == ret && OB_SUCC(resize_or_flush_shared_buf(data_writer, + escape_printer_.buf_, + escape_printer_.buf_len_, + escape_printer_.pos_))); + if (OB_FAIL(ret)) { + LOG_WARN("failed to print plain str", K(ret)); + } + } + if (OB_SUCC(ret)) { + data_writer.set_curr_pos(escape_printer_.pos_); + } + + return ret; +} + +int ObSelectIntoOp::print_normal_obj_without_escape(const ObObj &obj, ObCsvFileWriter &data_writer) +{ + int ret = OB_SUCCESS; + char* buf = NULL; + int64_t buf_len = 0; + int64_t pos = 0; + OZ(get_buf(buf, buf_len, 
pos, data_writer)); + if (OB_SUCC(ret) && !use_shared_buf_) { + if (OB_FAIL(obj.print_plain_str_literal(buf, buf_len, pos, print_params_))) { + if (OB_SIZE_OVERFLOW != ret) { + LOG_WARN("failed to print obj", K(ret)); + } else if (OB_FAIL(data_writer.flush_buf())) { + LOG_WARN("failed to flush buffer", K(ret)); + } else if (OB_FALSE_IT(pos = data_writer.get_curr_pos())) { + } else if (OB_FAIL(obj.print_plain_str_literal(buf, buf_len, pos, print_params_))) { + if (OB_SIZE_OVERFLOW != ret) { + LOG_WARN("failed to print obj", K(ret)); + } else if (OB_FAIL(use_shared_buf(data_writer, buf, buf_len, pos))) { + LOG_WARN("failed to use shared buffer", K(ret)); + } + } + } + } + if (OB_SUCC(ret) && use_shared_buf_) { + do { + if (OB_FAIL(obj.print_plain_str_literal(buf, buf_len, pos, print_params_))) { + LOG_WARN("failed to print obj", K(ret)); + } + } while (OB_SIZE_OVERFLOW == ret + && OB_SUCC(resize_or_flush_shared_buf(data_writer, buf, buf_len, pos))); + if (OB_FAIL(ret)) { + LOG_WARN("failed to print obj", K(ret)); + } + } + if (OB_SUCC(ret)) { + data_writer.set_curr_pos(pos); + } + return ret; +} + +int ObSelectIntoOp::print_json_to_json_buf(const ObObj &obj, char* &buf, int64_t &buf_len, int64_t &pos, - int64_t str_len) - { - int ret = OB_SUCCESS; - if (buf_len < str_len * 1.1) { - if (OB_FAIL(data_writer.flush_buf())) { - LOG_WARN("failed to flush buffer", K(ret)); - } else if (OB_FAIL(use_shared_buf(data_writer, buf, buf_len, pos))) { - LOG_WARN("failed to use shared buffer", K(ret)); - } - } - return ret; - } - - int ObSelectIntoOp::write_obj_to_file(const ObObj &obj, ObCsvFileWriter &data_writer, bool need_escape) - { - int ret = OB_SUCCESS; - // binary collation do not require to escape when encode with base64/hex - if (obj.get_collation_type() == CS_TYPE_BINARY && - (print_params_.binary_string_print_hex_ || print_params_.binary_string_print_base64_)) { - need_escape = false; - } - - if ((obj.is_string_type() || obj.is_json() || obj.is_collection_sql_type()) && 
need_escape) { - if (OB_FAIL(print_str_or_json_with_escape(obj, data_writer))) { - LOG_WARN("failed to print str or json with escape", K(ret)); - } - } else if (OB_FAIL(print_normal_obj_without_escape(obj, data_writer))) { - LOG_WARN("failed to print normal obj without escape", K(ret)); - } - return ret; - } - - int ObSelectIntoOp::print_str_or_json_with_escape(const ObObj &obj, ObCsvFileWriter &data_writer) - { - int ret = OB_SUCCESS; - char* buf = NULL; - int64_t buf_len = 0; - int64_t pos = 0; - ObCharsetType src_type = ObCharset::charset_type_by_coll(obj.get_collation_type()); - ObCharsetType dst_type = ObCharset::charset_type_by_coll(cs_type_); - escape_printer_.do_encode_ = !(src_type == CHARSET_BINARY || src_type == dst_type - || src_type == CHARSET_INVALID); - escape_printer_.need_enclose_ = has_enclose_ && !obj.is_null(); - escape_printer_.do_escape_ = true; - escape_printer_.print_hex_ = obj.get_collation_type() == CS_TYPE_BINARY - && print_params_.binary_string_print_hex_; - ObString str_to_escape; - ObEvalCtx::TempAllocGuard tmp_alloc_g(eval_ctx_); - common::ObArenaAllocator &temp_allocator = tmp_alloc_g.get_allocator(); - if (OB_FAIL(get_buf(escape_printer_.buf_, escape_printer_.buf_len_, escape_printer_.pos_, data_writer))) { - LOG_WARN("failed to get buffer", K(ret)); - } else if (obj.is_json() || obj.is_collection_sql_type()) { - ObObj inrow_obj = obj; - if (obj.is_lob_storage() - && OB_FAIL(ObTextStringIter::convert_outrow_lob_to_inrow_templob(obj, inrow_obj, NULL, &temp_allocator))) { - LOG_WARN("failed to convert outrow lobs", K(ret), K(obj)); - } else if (obj.is_collection_sql_type()) { - ObSubSchemaValue sub_meta; - if (OB_FAIL((get_exec_ctx().get_sqludt_meta_by_subschema_id(obj.get_meta().get_subschema_id(), sub_meta)))) { - LOG_WARN("failed to get collection subschema", K(ret), K(obj.get_meta().get_subschema_id())); - } else { - print_params_.coll_meta_ = reinterpret_cast(sub_meta.value_); - } - } - if (OB_FAIL(ret)) { - } else if 
(OB_FAIL(print_json_to_json_buf(inrow_obj, buf, buf_len, pos, data_writer))) { - LOG_WARN("failed to print normal obj without escape", K(ret)); - } else { - str_to_escape.assign_ptr(buf, pos); - escape_printer_.do_encode_ = false; - } - } else { - str_to_escape = obj.get_varchar(); - } - if (OB_SUCC(ret) && !use_shared_buf_ && OB_FAIL(check_buf_sufficient(data_writer, - escape_printer_.buf_, - escape_printer_.buf_len_, - escape_printer_.pos_, - str_to_escape.length()))) { - LOG_WARN("failed to check if buf is sufficient", K(ret)); - } - if (OB_SUCC(ret) && !use_shared_buf_) { - if (OB_FAIL(ObFastStringScanner::foreach_char(str_to_escape, - src_type, - escape_printer_, - escape_printer_.do_encode_, - escape_printer_.ignore_convert_failed_))) { - if (OB_SIZE_OVERFLOW != ret) { - LOG_WARN("failed to print plain str", K(ret), K(src_type), K(escape_printer_.do_encode_)); - } else if (OB_FAIL(data_writer.flush_buf())) { - LOG_WARN("failed to flush buffer", K(ret)); - } else if (OB_FALSE_IT(escape_printer_.pos_ = data_writer.get_curr_pos())) { - } else if (OB_FAIL(ObFastStringScanner::foreach_char(str_to_escape, + ObCsvFileWriter &data_writer) +{ + int ret = OB_SUCCESS; + buf = get_json_buf(); + buf_len = get_json_buf_len(); + pos = 0; + do { + if (OB_FAIL(obj.print_plain_str_literal(buf, buf_len, pos, print_params_))) { + LOG_WARN("failed to print obj", K(ret)); + } + } while (OB_SIZE_OVERFLOW == ret + && OB_SUCC(resize_buf(buf, buf_len, pos, data_writer.get_curr_pos(), true))); + if (OB_FAIL(ret)) { + LOG_WARN("failed to print json to json buffer", K(ret)); + } + return ret; +} + +int ObSelectIntoOp::write_lob_to_file(const ObObj &obj, + const ObExpr &expr, + const ObDatum &datum, + ObCsvFileWriter &data_writer) +{ + int ret = OB_SUCCESS; + ObCharsetType src_type = ObCharset::charset_type_by_coll(obj.get_collation_type()); + ObCharsetType dst_type = ObCharset::charset_type_by_coll(cs_type_); + escape_printer_.need_enclose_ = has_enclose_; + escape_printer_.do_encode_ = 
!(src_type == CHARSET_BINARY || src_type == dst_type + || src_type == CHARSET_INVALID); + escape_printer_.do_escape_ = has_escape_; + escape_printer_.print_hex_ = obj.get_collation_type() == CS_TYPE_BINARY + && print_params_.binary_string_print_hex_; + ObDatumMeta input_meta = expr.datum_meta_; + ObTextStringIterState state; + ObString src_block_data; + ObTextStringIter lob_iter(input_meta.type_, input_meta.cs_type_, datum.get_string(), + expr.obj_meta_.has_lob_header()); + ObEvalCtx::TempAllocGuard tmp_alloc_g(eval_ctx_); + common::ObArenaAllocator &temp_allocator = tmp_alloc_g.get_allocator(); + int64_t truncated_len = 0; + bool stop_when_truncated = false; + OZ(lob_iter.init(0, NULL, &temp_allocator)); + OZ(get_buf(escape_printer_.buf_, escape_printer_.buf_len_, escape_printer_.pos_, data_writer)); + // When truncated_len == src_block_data.length() when truncated length equals source block data length + // Indicates that the current foreach_char is processing only invalid data at the end of the lob, i.e., truncated data from the previous round, to avoid infinite loops + while (OB_SUCC(ret) + && (state = lob_iter.get_next_block(src_block_data)) == TEXTSTRING_ITER_NEXT) { + // outrow lob will only be false on the last iteration, inrow lob iterates only once, and is false + stop_when_truncated = (truncated_len != src_block_data.length()) && lob_iter.is_outrow_lob(); + if (!use_shared_buf_ && OB_FAIL(check_buf_sufficient(data_writer, + escape_printer_.buf_, + escape_printer_.buf_len_, + escape_printer_.pos_, + src_block_data.length()))) { + LOG_WARN("failed to check if buf is sufficient", K(ret)); + } + if (OB_SUCC(ret) && !use_shared_buf_) { + if (OB_FAIL(ObFastStringScanner::foreach_char(src_block_data, + src_type, + escape_printer_, + escape_printer_.do_encode_, + escape_printer_.ignore_convert_failed_, + stop_when_truncated, + &truncated_len))) { + if (OB_ERR_DATA_TRUNCATED == ret && stop_when_truncated) { + lob_iter.set_reserved_byte_len(truncated_len); + ret = 
OB_SUCCESS; + } else if (OB_SIZE_OVERFLOW != ret) { + LOG_WARN("failed to print lob", K(ret)); + } else if (OB_FAIL(data_writer.flush_buf())) { + LOG_WARN("failed to flush buffer", K(ret)); + } else if (OB_FALSE_IT(escape_printer_.pos_ = data_writer.get_curr_pos())) { + } else if (OB_FAIL(ObFastStringScanner::foreach_char(src_block_data, + src_type, + escape_printer_, + escape_printer_.do_encode_, + escape_printer_.ignore_convert_failed_, + stop_when_truncated, + &truncated_len))) { + if (OB_ERR_DATA_TRUNCATED == ret && stop_when_truncated) { + lob_iter.set_reserved_byte_len(truncated_len); + ret = OB_SUCCESS; + } else if (OB_SIZE_OVERFLOW != ret) { + LOG_WARN("failed to print lob", K(ret)); + } else if (OB_FAIL(use_shared_buf(data_writer, + escape_printer_.buf_, + escape_printer_.buf_len_, + escape_printer_.pos_))) { + LOG_WARN("failed to use shared buffer", K(ret)); + } + } + } + } + if (OB_SUCC(ret) && use_shared_buf_) { + if (OB_FAIL(ObFastStringScanner::foreach_char(src_block_data, + src_type, + escape_printer_, + escape_printer_.do_encode_, + escape_printer_.ignore_convert_failed_, + stop_when_truncated, + &truncated_len))) { + if (OB_ERR_DATA_TRUNCATED == ret && stop_when_truncated) { + lob_iter.set_reserved_byte_len(truncated_len); + ret = OB_SUCCESS; + } else if (OB_SIZE_OVERFLOW != ret) { + LOG_WARN("failed to print lob", K(ret)); + } else if (OB_FAIL(data_writer.flush_shared_buf(shared_buf_, true))) { + LOG_WARN("failed to flush shared buffer", K(ret)); + } else if (OB_FALSE_IT(escape_printer_.pos_ = 0)) { + } else if (OB_FAIL(ObFastStringScanner::foreach_char(src_block_data, src_type, escape_printer_, escape_printer_.do_encode_, - escape_printer_.ignore_convert_failed_))) { - if (OB_SIZE_OVERFLOW != ret) { - LOG_WARN("failed to print plain str", K(ret), K(src_type), K(escape_printer_.do_encode_)); - } else if (OB_FAIL(use_shared_buf(data_writer, - escape_printer_.buf_, - escape_printer_.buf_len_, - escape_printer_.pos_))) { - LOG_WARN("failed to use 
shared buffer", K(ret)); - } - } - } - } - if (OB_SUCC(ret) && use_shared_buf_) { - do { - if (OB_FAIL(ObFastStringScanner::foreach_char(str_to_escape, - src_type, - escape_printer_, - escape_printer_.do_encode_, - escape_printer_.ignore_convert_failed_))) { - LOG_WARN("failed to print plain str", K(ret), K(src_type), K(escape_printer_.do_encode_)); - } - } while (OB_SIZE_OVERFLOW == ret && OB_SUCC(resize_or_flush_shared_buf(data_writer, - escape_printer_.buf_, - escape_printer_.buf_len_, - escape_printer_.pos_))); - if (OB_FAIL(ret)) { - LOG_WARN("failed to print plain str", K(ret)); - } - } - if (OB_SUCC(ret)) { - data_writer.set_curr_pos(escape_printer_.pos_); - } - - return ret; - } - - int ObSelectIntoOp::print_normal_obj_without_escape(const ObObj &obj, ObCsvFileWriter &data_writer) - { - int ret = OB_SUCCESS; - char* buf = NULL; - int64_t buf_len = 0; - int64_t pos = 0; - OZ(get_buf(buf, buf_len, pos, data_writer)); - if (OB_SUCC(ret) && !use_shared_buf_) { - if (OB_FAIL(obj.print_plain_str_literal(buf, buf_len, pos, print_params_))) { - if (OB_SIZE_OVERFLOW != ret) { - LOG_WARN("failed to print obj", K(ret)); - } else if (OB_FAIL(data_writer.flush_buf())) { - LOG_WARN("failed to flush buffer", K(ret)); - } else if (OB_FALSE_IT(pos = data_writer.get_curr_pos())) { - } else if (OB_FAIL(obj.print_plain_str_literal(buf, buf_len, pos, print_params_))) { - if (OB_SIZE_OVERFLOW != ret) { - LOG_WARN("failed to print obj", K(ret)); - } else if (OB_FAIL(use_shared_buf(data_writer, buf, buf_len, pos))) { - LOG_WARN("failed to use shared buffer", K(ret)); - } - } - } - } - if (OB_SUCC(ret) && use_shared_buf_) { - do { - if (OB_FAIL(obj.print_plain_str_literal(buf, buf_len, pos, print_params_))) { - LOG_WARN("failed to print obj", K(ret)); - } - } while (OB_SIZE_OVERFLOW == ret - && OB_SUCC(resize_or_flush_shared_buf(data_writer, buf, buf_len, pos))); - if (OB_FAIL(ret)) { - LOG_WARN("failed to print obj", K(ret)); - } - } - if (OB_SUCC(ret)) { - 
data_writer.set_curr_pos(pos); - } - return ret; - } - - int ObSelectIntoOp::print_json_to_json_buf(const ObObj &obj, - char* &buf, - int64_t &buf_len, - int64_t &pos, - ObCsvFileWriter &data_writer) - { - int ret = OB_SUCCESS; - buf = get_json_buf(); - buf_len = get_json_buf_len(); - pos = 0; - do { - if (OB_FAIL(obj.print_plain_str_literal(buf, buf_len, pos, print_params_))) { - LOG_WARN("failed to print obj", K(ret)); - } - } while (OB_SIZE_OVERFLOW == ret - && OB_SUCC(resize_buf(buf, buf_len, pos, data_writer.get_curr_pos(), true))); - if (OB_FAIL(ret)) { - LOG_WARN("failed to print json to json buffer", K(ret)); - } - return ret; - } - - int ObSelectIntoOp::write_lob_to_file(const ObObj &obj, - const ObExpr &expr, - const ObDatum &datum, - ObCsvFileWriter &data_writer) - { - int ret = OB_SUCCESS; - ObCharsetType src_type = ObCharset::charset_type_by_coll(obj.get_collation_type()); - ObCharsetType dst_type = ObCharset::charset_type_by_coll(cs_type_); - escape_printer_.need_enclose_ = has_enclose_; - escape_printer_.do_encode_ = !(src_type == CHARSET_BINARY || src_type == dst_type - || src_type == CHARSET_INVALID); - escape_printer_.do_escape_ = has_escape_; - escape_printer_.print_hex_ = obj.get_collation_type() == CS_TYPE_BINARY - && print_params_.binary_string_print_hex_; - ObDatumMeta input_meta = expr.datum_meta_; - ObTextStringIterState state; - ObString src_block_data; - ObTextStringIter lob_iter(input_meta.type_, input_meta.cs_type_, datum.get_string(), - expr.obj_meta_.has_lob_header()); - ObEvalCtx::TempAllocGuard tmp_alloc_g(eval_ctx_); - common::ObArenaAllocator &temp_allocator = tmp_alloc_g.get_allocator(); - int64_t truncated_len = 0; - bool stop_when_truncated = false; - OZ(lob_iter.init(0, NULL, &temp_allocator)); - OZ(get_buf(escape_printer_.buf_, escape_printer_.buf_len_, escape_printer_.pos_, data_writer)); - // When truncated_len == src_block_data.length() when truncated length equals source block data length - // Indicates that the current 
foreach_char is processing only invalid data at the end of the lob, i.e., truncated data from the previous round, to avoid infinite loops - while (OB_SUCC(ret) - && (state = lob_iter.get_next_block(src_block_data)) == TEXTSTRING_ITER_NEXT) { - // outrow lob will only be false on the last iteration, inrow lob iterates only once, and is false - stop_when_truncated = (truncated_len != src_block_data.length()) && lob_iter.is_outrow_lob(); - if (!use_shared_buf_ && OB_FAIL(check_buf_sufficient(data_writer, - escape_printer_.buf_, - escape_printer_.buf_len_, - escape_printer_.pos_, - src_block_data.length()))) { - LOG_WARN("failed to check if buf is sufficient", K(ret)); - } - if (OB_SUCC(ret) && !use_shared_buf_) { - if (OB_FAIL(ObFastStringScanner::foreach_char(src_block_data, - src_type, - escape_printer_, - escape_printer_.do_encode_, - escape_printer_.ignore_convert_failed_, - stop_when_truncated, - &truncated_len))) { - if (OB_ERR_DATA_TRUNCATED == ret && stop_when_truncated) { - lob_iter.set_reserved_byte_len(truncated_len); - ret = OB_SUCCESS; - } else if (OB_SIZE_OVERFLOW != ret) { - LOG_WARN("failed to print lob", K(ret)); - } else if (OB_FAIL(data_writer.flush_buf())) { - LOG_WARN("failed to flush buffer", K(ret)); - } else if (OB_FALSE_IT(escape_printer_.pos_ = data_writer.get_curr_pos())) { - } else if (OB_FAIL(ObFastStringScanner::foreach_char(src_block_data, - src_type, - escape_printer_, - escape_printer_.do_encode_, - escape_printer_.ignore_convert_failed_, - stop_when_truncated, - &truncated_len))) { - if (OB_ERR_DATA_TRUNCATED == ret && stop_when_truncated) { - lob_iter.set_reserved_byte_len(truncated_len); - ret = OB_SUCCESS; - } else if (OB_SIZE_OVERFLOW != ret) { - LOG_WARN("failed to print lob", K(ret)); - } else if (OB_FAIL(use_shared_buf(data_writer, - escape_printer_.buf_, - escape_printer_.buf_len_, - escape_printer_.pos_))) { - LOG_WARN("failed to use shared buffer", K(ret)); - } - } - } - } - if (OB_SUCC(ret) && use_shared_buf_) { - if 
(OB_FAIL(ObFastStringScanner::foreach_char(src_block_data, - src_type, - escape_printer_, - escape_printer_.do_encode_, - escape_printer_.ignore_convert_failed_, - stop_when_truncated, - &truncated_len))) { - if (OB_ERR_DATA_TRUNCATED == ret && stop_when_truncated) { - lob_iter.set_reserved_byte_len(truncated_len); - ret = OB_SUCCESS; - } else if (OB_SIZE_OVERFLOW != ret) { - LOG_WARN("failed to print lob", K(ret)); - } else if (OB_FAIL(data_writer.flush_shared_buf(shared_buf_, true))) { - LOG_WARN("failed to flush shared buffer", K(ret)); - } else if (OB_FALSE_IT(escape_printer_.pos_ = 0)) { - } else if (OB_FAIL(ObFastStringScanner::foreach_char(src_block_data, - src_type, - escape_printer_, - escape_printer_.do_encode_, - escape_printer_.ignore_convert_failed_, - stop_when_truncated, - &truncated_len))) { - if (OB_ERR_DATA_TRUNCATED == ret && stop_when_truncated) { - lob_iter.set_reserved_byte_len(truncated_len); - ret = OB_SUCCESS; - } else { - LOG_WARN("failed to print lob", K(ret), K(src_block_data.length()), K(shared_buf_len_), - K(data_writer.get_curr_pos()), K(escape_printer_.buf_len_), K(escape_printer_.pos_)); - } - } - } - } - data_writer.set_curr_pos(escape_printer_.pos_); - } - if (OB_FAIL(ret)) { - } else if (state != TEXTSTRING_ITER_NEXT && state != TEXTSTRING_ITER_END) { - ret = (lob_iter.get_inner_ret() != OB_SUCCESS) ? 
- lob_iter.get_inner_ret() : OB_INVALID_DATA; - LOG_WARN("iter state invalid", K(ret), K(state), K(lob_iter)); - } - return ret; - } - - int ObSelectIntoOp::write_single_char_to_file(const char *wchar, ObCsvFileWriter &data_writer) - { - int ret = OB_SUCCESS; - char* buf = NULL; - int64_t buf_len = 0; - int64_t pos = 0; - OZ(get_buf(buf, buf_len, pos, data_writer)); - if (OB_SUCC(ret) && !use_shared_buf_) { - if (pos < buf_len) { - MEMCPY(buf + pos, wchar, 1); - data_writer.set_curr_pos(pos + 1); - } else if (OB_FAIL(data_writer.flush_buf())) { - LOG_WARN("failed to flush buffer", K(ret)); - } else if (OB_FALSE_IT(pos = data_writer.get_curr_pos())) { - } else if (pos < buf_len) { - MEMCPY(buf + pos, wchar, 1); - data_writer.set_curr_pos(pos + 1); - } else if (OB_FAIL(use_shared_buf(data_writer, buf, buf_len, pos))) { - LOG_WARN("failed to use shared buffer", K(ret)); - } - } - if (OB_SUCC(ret) && use_shared_buf_) { - if (pos < buf_len) { - MEMCPY(buf + pos, wchar, 1); - data_writer.set_curr_pos(pos + 1); - } else if (OB_FAIL(resize_or_flush_shared_buf(data_writer, buf, buf_len, pos))) { - LOG_WARN("failed to resize or flush shared buffer", K(ret)); - } else if (pos < buf_len) { - MEMCPY(buf + pos, wchar, 1); - data_writer.set_curr_pos(pos + 1); - } else { - ret = OB_ERR_UNEXPECTED; - LOG_WARN("get unexpected error", K(ret)); - } - } - return ret; - } - - int ObSelectIntoOp::print_lob_field(const ObObj &obj, - const ObExpr &expr, - const ObDatum &datum, - ObCsvFileWriter &data_writer) - { - int ret = OB_SUCCESS; - if (has_enclose_) { - OZ(write_single_char_to_file(&char_enclose_, data_writer)); - } - OZ(write_lob_to_file(obj, expr, datum, data_writer)); - if (has_enclose_) { - OZ(write_single_char_to_file(&char_enclose_, data_writer)); - } - return ret; - } - - int ObSelectIntoOp::print_field(const ObObj &obj, ObCsvFileWriter &data_writer) - { - int ret = OB_SUCCESS; - char char_n = 'N'; - const bool need_enclose = has_enclose_ && !obj.is_null() - && (!is_optional_ 
|| obj.is_string_type() || obj.is_collection_sql_type() - || obj.is_json() || obj.is_geometry() || obj.is_date() - || obj.is_time() || obj.is_timestamp() || obj.is_datetime() - || obj.is_mysql_date() || obj.is_mysql_datetime()); - if (need_enclose) { - OZ(write_single_char_to_file(&char_enclose_, data_writer)); - } - if (!has_escape_) { - OZ(write_obj_to_file(obj, data_writer, false)); - } else if (obj.is_null()) { - OZ(write_single_char_to_file(&char_escape_, data_writer)); - OZ(write_single_char_to_file(&char_n, data_writer)); - } else { - OZ(write_obj_to_file(obj, data_writer, true)); - } - if (need_enclose) { - OZ(write_single_char_to_file(&char_enclose_, data_writer)); - } - return ret; - } - - int ObSelectIntoOp::into_outfile(ObExternalFileWriter *data_writer) - { - int ret = OB_SUCCESS; - const ObIArray &select_exprs = MY_SPEC.select_exprs_; - ObDatum *datum = NULL; - ObObj obj; - ObDatum *partition_datum = NULL; - ObCsvFileWriter *csv_data_writer = NULL; - if (do_partition_) { - if (OB_FAIL(MY_SPEC.file_partition_expr_->eval(eval_ctx_, partition_datum))) { - LOG_WARN("eval expr failed", K(ret)); - } else if (OB_ISNULL(partition_datum)) { - ret = OB_ERR_UNEXPECTED; - LOG_WARN("get unexpected null", K(ret)); - } else if (OB_FAIL(get_data_writer_for_partition(partition_datum->get_string(), data_writer))) { - LOG_WARN("failed to set data writer for partition", K(ret)); - } - } - if (OB_SUCC(ret)) { - if (OB_ISNULL(csv_data_writer = static_cast(data_writer))) { - ret = OB_ERR_UNEXPECTED; - LOG_WARN("get unexpected null data writer", K(ret)); - } - } - for (int64_t i = 0; OB_SUCC(ret) && i < select_exprs.count(); ++i) { - if (OB_ISNULL(select_exprs.at(i))) { - ret = OB_ERR_UNEXPECTED; - LOG_WARN("select expr is unexpected null", K(ret)); - } else if (OB_FAIL(select_exprs.at(i)->eval(eval_ctx_, datum))) { - LOG_WARN("eval expr failed", K(ret)); - } else if (OB_ISNULL(datum)) { - ret = OB_ERR_UNEXPECTED; - LOG_WARN("datum is unexpected null", K(ret)); - } else if 
(OB_FAIL(datum->to_obj(obj, - select_exprs.at(i)->obj_meta_, - select_exprs.at(i)->obj_datum_map_))) { - LOG_WARN("failed to get obj from datum", K(ret)); - } else if (!ob_is_text_tc(select_exprs.at(i)->obj_meta_.get_type()) || obj.is_null()) { - OZ(print_field(obj, *csv_data_writer)); - } else { // text tc - OZ(print_lob_field(obj, *select_exprs.at(i), *datum, *csv_data_writer)); - } - // print field terminator - if (OB_SUCC(ret) && i != select_exprs.count() - 1) { - OZ(write_obj_to_file(field_str_, *csv_data_writer)); - } - } - // print line terminator - OZ(write_obj_to_file(line_str_, *csv_data_writer)); - // check if need split file - OZ(check_csv_file_size(*csv_data_writer)); - // clear shared buffer - OZ(csv_data_writer->flush_shared_buf(shared_buf_)); - if (has_compress_) { - OZ(csv_data_writer->flush_buf()); - } - return ret; - } - - static OB_INLINE int get_cast_ret(const bool is_strict_mode, int ret) - { - if (OB_SUCCESS != ret && !is_strict_mode) { - ret = OB_SUCCESS; - } - return ret; - } - - #define CAST_FAIL(stmt) \ - (OB_UNLIKELY((OB_SUCCESS != (ret = get_cast_ret((is_strict_mode), (stmt)))))) - - - int ObSelectIntoOp::decimal_to_string(const ObDatum &datum, - const ObDatumMeta &datum_meta, - std::string &res, - ObIAllocator &allocator) - { - int ret = OB_SUCCESS; - char *buf = NULL; - int64_t pos = 0; - if (OB_ISNULL(buf = static_cast(allocator.alloc(OB_CAST_TO_VARCHAR_MAX_LENGTH)))) { - ret = OB_ALLOCATE_MEMORY_FAILED; - LOG_WARN("failed to alloc memory", K(ret)); - } else if (OB_FAIL(wide::to_string(datum.get_decimal_int(), datum.get_int_bytes(), datum_meta.scale_, - buf, OB_CAST_TO_VARCHAR_MAX_LENGTH, pos))) { - LOG_WARN("failed to get string", K(ret)); - } else { - res.assign(buf, pos); - } - return ret; - } - - - int ObSelectIntoOp::into_outfile_batch_csv(const ObBatchRows &brs, ObExternalFileWriter *data_writer) - { - int ret = OB_SUCCESS; - const ObIArray &select_exprs = MY_SPEC.select_exprs_; - ObArray datum_vectors; - ObDatum *datum = NULL; 
- ObObj obj; - ObDatumVector partition_datum_vector; - ObCsvFileWriter *csv_data_writer = NULL; - for (int64_t i = 0; OB_SUCC(ret) && i < select_exprs.count(); ++i) { - if (OB_FAIL(select_exprs.at(i)->eval_batch(eval_ctx_, *brs.skip_, brs.size_))) { - LOG_WARN("failed to eval batch", K(ret)); - } else if (OB_FAIL(datum_vectors.push_back(select_exprs.at(i)->locate_expr_datumvector(eval_ctx_)))) { - LOG_WARN("failed to push back datum vector", K(ret)); - } - } - - if (OB_SUCC(ret) && do_partition_) { - if (OB_FAIL(MY_SPEC.file_partition_expr_->eval_batch(eval_ctx_, *brs.skip_, brs.size_))) { - LOG_WARN("failed to eval batch", K(ret)); - } else { - partition_datum_vector = MY_SPEC.file_partition_expr_->locate_expr_datumvector(eval_ctx_); - } - } - for (int64_t i = 0; OB_SUCC(ret) && i < brs.size_; ++i) { - if (brs.skip_->contain(i)) { - // do nothing - } else if (do_partition_ && OB_ISNULL(partition_datum_vector.at(i))) { - ret = OB_ERR_UNEXPECTED; - LOG_WARN("get unexpected null", K(ret)); - } else if (do_partition_ && OB_FAIL(get_data_writer_for_partition(partition_datum_vector.at(i)->get_string(), - data_writer))) { - LOG_WARN("failed to set data writer for partition", K(ret)); - } else if (OB_ISNULL(csv_data_writer = static_cast(data_writer))) { - ret = OB_ERR_UNEXPECTED; - LOG_WARN("get unexpected null data writer", K(ret)); - } else if (has_compress_ && OB_ISNULL(csv_data_writer->get_compress_stream_writer()) - && OB_FAIL(csv_data_writer->init_compress_writer(ctx_.get_allocator(), - external_properties_.csv_format_.compression_algorithm_, - MY_SPEC.buffer_size_))) { - LOG_WARN("failed to init compress stream writer", K(ret)); - } else { - for (int64_t col_idx = 0; OB_SUCC(ret) && col_idx < select_exprs.count(); ++col_idx) { - if (OB_ISNULL(datum = datum_vectors.at(col_idx).at(i))) { - ret = OB_ERR_UNEXPECTED; - LOG_WARN("datum is unexpected null", K(ret)); - } else if (OB_FAIL(datum->to_obj(obj, - select_exprs.at(col_idx)->obj_meta_, - 
select_exprs.at(col_idx)->obj_datum_map_))) { - LOG_WARN("failed to get obj from datum", K(ret)); - } else if (!ob_is_text_tc(select_exprs.at(col_idx)->obj_meta_.get_type()) || obj.is_null()) { - OZ(print_field(obj, *csv_data_writer)); - } else { // text tc - OZ(print_lob_field(obj, *select_exprs.at(col_idx), *datum, *csv_data_writer)); - } - // print field terminator - if (OB_SUCC(ret) && col_idx != select_exprs.count() - 1) { - OZ(write_obj_to_file(field_str_, *csv_data_writer)); - } - } - // print line terminator - OZ(write_obj_to_file(line_str_, *csv_data_writer)); - // check if need split file - OZ(check_csv_file_size(*csv_data_writer)); - // clear shared buffer - OZ(csv_data_writer->flush_shared_buf(shared_buf_)); - if (has_compress_) { - OZ(csv_data_writer->flush_buf()); - } - } - } - return ret; - } - - int ObSelectIntoOp::get_data_from_expr_vector(const common::ObIVector* expr_vector, - int row_idx, - ObObjType type, - int64_t &value, - const bool is_strict_mode, - const ObDateSqlMode date_sql_mode) - { - int ret = OB_SUCCESS; - int32_t date; - switch(type) { - case ObTinyIntType: - value = expr_vector->get_tinyint(row_idx); - break; - case ObSmallIntType: - value = expr_vector->get_smallint(row_idx); - break; - case ObMediumIntType: - value = expr_vector->get_mediumint(row_idx); - break; - case ObInt32Type: - value = expr_vector->get_int32(row_idx); - break; - case ObIntType: - value = expr_vector->get_int(row_idx); - break; - case ObYearType: - value = expr_vector->get_year(row_idx); - break; - case ObDateType: - value = expr_vector->get_date(row_idx); - break; - case ObMySQLDateType: - CAST_FAIL( - ObTimeConverter::mdate_to_date(expr_vector->get_mysql_date(row_idx), date, date_sql_mode)); - value = date; - break; - case ObMySQLDateTimeType: - CAST_FAIL(ObTimeConverter::mdatetime_to_datetime(expr_vector->get_mysql_datetime(row_idx), value, - date_sql_mode)); - break; - default: - ret = OB_OBJ_TYPE_ERROR; - } - return ret; - } - - bool 
ObSelectIntoOp::file_need_split(int64_t file_size) - { - return (file_location_ == IntoFileLocation::SERVER_DISK - && !MY_SPEC.is_single_ && file_size > MY_SPEC.max_file_size_) - || (file_location_ != IntoFileLocation::SERVER_DISK - && ((!MY_SPEC.is_single_ && file_size > min(MY_SPEC.max_file_size_, MAX_OSS_FILE_SIZE)) - || (MY_SPEC.is_single_ && file_size > MAX_OSS_FILE_SIZE))); - } - - int ObSelectIntoOp::check_oracle_number(ObObjType obj_type, int16_t &precision, int8_t scale) - { - int ret = OB_SUCCESS; - return ret; - } - - int ObSelectIntoOp::calc_byte_array(const common::ObIVector* expr_vector, - int row_idx, - const ObDatumMeta &datum_meta, - const ObObjMeta &obj_meta, - ObIAllocator &allocator, - char* &buf, - uint32_t &res_len) - { - int ret = OB_SUCCESS; - ObString ob_str; - ObString res_str; - bool has_lob_header = obj_meta.has_lob_header(); - res_len = 0; - buf = nullptr; - int64_t buf_size = 0; - if (OB_FAIL(ObTextStringHelper::read_real_string_data(allocator, expr_vector, datum_meta, - has_lob_header, ob_str, row_idx))) { - LOG_WARN("failed to get string", K(ret)); - } else if (ob_str.length() == 0 || CS_TYPE_BINARY == datum_meta.cs_type_ - || CHARSET_UTF8MB4 == ObCharset::charset_type_by_coll(datum_meta.cs_type_)) { - if (OB_FAIL(ob_write_string(allocator, ob_str, res_str))) { - LOG_WARN("failed to write string", K(ret)); - } else { - res_len = static_cast(res_str.length()); - buf = const_cast(res_str.ptr()); - } - } else if (OB_FALSE_IT(buf_size = ob_str.length() * ObCharset::MAX_MB_LEN)) { - } else if (OB_ISNULL(buf = static_cast(allocator.alloc(buf_size)))) { - ret = OB_ALLOCATE_MEMORY_FAILED; - LOG_WARN("failed to alloc memory", K(ret), K(buf_size)); - } else if (OB_FAIL(ObCharset::charset_convert(datum_meta.cs_type_, ob_str.ptr(), - ob_str.length(), CS_TYPE_UTF8MB4_BIN, - buf, buf_size, res_len, false, false))) { - LOG_WARN("failed to convert charset", K(ret)); - } - return ret; - } - - #ifndef OB_BUILD_EMBED_MODE - int 
ObSelectIntoOp::init_parquet_env() - { - int ret = OB_SUCCESS; - arrow_alloc_.init(MTL_ID()); - if (OB_FAIL(setup_parquet_schema())) { - LOG_WARN("failed to set up parquet schema", K(ret)); - } else if (OB_FAIL(init_env_common())) { - LOG_WARN("failed to init env common", K(ret)); - } - return ret; - } - - int ObSelectIntoOp::get_parquet_logical_type(std::shared_ptr &logical_type, - const ObObjType &obj_type, - const int32_t precision, - const int32_t scale) - { - int ret = OB_SUCCESS; - if (ObTinyIntType == obj_type) { - logical_type = parquet::LogicalType::Int(8, true); - } else if (ObSmallIntType == obj_type) { - logical_type = parquet::LogicalType::Int(16, true); - } else if (ObMediumIntType == obj_type || ObInt32Type == obj_type) { - logical_type = parquet::LogicalType::Int(32, true); - } else if (ObIntType == obj_type) { - logical_type = parquet::LogicalType::Int(64, true); - } else if (ObUTinyIntType == obj_type) { - logical_type = parquet::LogicalType::Int(8, false); - } else if (ObUSmallIntType == obj_type) { - logical_type = parquet::LogicalType::Int(16, false); - } else if (ObUMediumIntType == obj_type || ObUInt32Type == obj_type) { - logical_type = parquet::LogicalType::Int(32, false); - } else if (ObUInt64Type == obj_type) { - logical_type = parquet::LogicalType::Int(64, false); - } else if (ob_is_float_tc(obj_type) || ob_is_double_tc(obj_type)) { // float, ufloat, double, udouble - logical_type = parquet::LogicalType::None(); - } else if (ob_is_number_or_decimal_int_tc(obj_type)) { - logical_type = parquet::LogicalType::Decimal(precision, scale); - } else if (ob_is_datetime_or_mysql_datetime(obj_type)) { - logical_type = parquet::LogicalType::Timestamp(false, parquet::LogicalType::TimeUnit::MICROS); - } else if (ObTimestampType == obj_type) { - logical_type = parquet::LogicalType::Timestamp(true, parquet::LogicalType::TimeUnit::MICROS); - } else if (ob_is_date_or_mysql_date(obj_type)) { - logical_type = parquet::LogicalType::Date(); - } else if 
(ob_is_time_tc(obj_type)) { - logical_type = parquet::LogicalType::Time(false, parquet::LogicalType::TimeUnit::MICROS); - } else if (ob_is_year_tc(obj_type)) { - logical_type = parquet::LogicalType::Int(8, false); - } else if (ob_is_string_type(obj_type) || ObNullType == obj_type) { - logical_type = parquet::LogicalType::String(); - } else if (ob_is_bit_tc(obj_type) /*uint64_t*/) { - logical_type = parquet::LogicalType::Int(64, false); - } else if (ob_is_enum_or_set_type(obj_type) /*uint64_t*/) { - logical_type = parquet::LogicalType::Enum(); - } else { - // TODO(bitao): support json/bson/uuid/map/list - ret = OB_NOT_SUPPORTED; - LOG_USER_ERROR(OB_NOT_SUPPORTED, "convert this ob type to parquet type"); - LOG_WARN("unsupported obj type", K(ret), K(obj_type)); - } - return ret; - } - - int ObSelectIntoOp::get_parquet_physical_type(parquet::Type::type &physical_type, - const ObObjType &obj_type) - { - int ret = OB_SUCCESS; - if (ObTinyIntType == obj_type || ObSmallIntType == obj_type - || ObMediumIntType == obj_type || ObInt32Type == obj_type - || ObUTinyIntType == obj_type || ObUSmallIntType == obj_type - || ObUMediumIntType == obj_type || ObUInt32Type == obj_type - || ob_is_date_or_mysql_date(obj_type) || ob_is_year_tc(obj_type)) { - physical_type = parquet::Type::INT32; - } else if (ObIntType == obj_type || ObUInt64Type == obj_type - || ob_is_datetime_or_mysql_datetime_tc(obj_type) - || ob_is_time_tc(obj_type) || ob_is_bit_tc(obj_type)) { - physical_type = parquet::Type::INT64; - } else if (ob_is_float_tc(obj_type)) { // float, ufloat - physical_type = parquet::Type::FLOAT; - } else if (ob_is_double_tc(obj_type)) { // double, udouble - physical_type = parquet::Type::DOUBLE; - } else if (ob_is_number_or_decimal_int_tc(obj_type)) { - physical_type = parquet::Type::FIXED_LEN_BYTE_ARRAY; - } else if (ob_is_string_tc(obj_type) /*varchar,char,varbinary,binary*/ - || ob_is_text_tc(obj_type) /*TinyText,MediumText,Text,LongText,TinyBLOB,MediumBLOB,BLOB,LongBLOB*/ - || 
ob_is_enum_or_set_type(obj_type) - || ObNullType == obj_type) { - physical_type = parquet::Type::BYTE_ARRAY; - } else { - ret = OB_NOT_SUPPORTED; - LOG_USER_ERROR(OB_NOT_SUPPORTED, "convert this ob type to parquet type"); - LOG_WARN("unsupported obj type", K(ret), K(obj_type)); - } - return ret; - } - - int ObSelectIntoOp::calc_parquet_decimal_length(int precision) - { - // Put in utils? - return std::ceil((1 + precision / std::log10(2)) / 8); - } - - int ObSelectIntoOp::setup_parquet_schema() - { - int ret = OB_SUCCESS; - ObMallocHookAttrGuard guard(ObMemAttr(MTL_ID(), "IntoParquet")); - parquet::schema::NodeVector fields; - const ObIArray &select_exprs = MY_SPEC.select_exprs_; - std::shared_ptr logical_type; - parquet::Type::type physical_type; - parquet::schema::NodePtr node; - try { - for (int64_t i = 0; OB_SUCC(ret) && i < select_exprs.count(); ++i) { - ObDatumMeta meta = select_exprs.at(i)->datum_meta_; - ObObjType obj_type = meta.get_type(); - ObString alias_name = MY_SPEC.alias_names_.strs_.at(i); - std::string column_name(alias_name.ptr(), alias_name.length()); - int primitive_length = -1; - if (OB_FAIL(check_oracle_number(obj_type, - select_exprs.at(i)->datum_meta_.precision_, - select_exprs.at(i)->datum_meta_.scale_))) { - LOG_WARN("not support number type", K(ret)); - } else if (OB_FAIL(get_parquet_logical_type(logical_type, - obj_type, - select_exprs.at(i)->datum_meta_.precision_, - select_exprs.at(i)->datum_meta_.scale_))) { - LOG_WARN("failed to get related logical type", K(ret)); - } else if (OB_FAIL(get_parquet_physical_type(physical_type, obj_type))) { - LOG_WARN("failed to get related physical type", K(ret)); - } else if (ob_is_number_or_decimal_int_tc(obj_type) - && OB_FALSE_IT(primitive_length = calc_parquet_decimal_length( - select_exprs.at(i)->datum_meta_.precision_))) { - } else { - //todo@linyi repetition level - node = parquet::schema::PrimitiveNode::Make(column_name, parquet::Repetition::OPTIONAL, - logical_type, physical_type, 
primitive_length); - fields.push_back(node); - } - } - if (OB_SUCC(ret)) { - parquet_writer_schema_ = std::static_pointer_cast( - parquet::schema::GroupNode::Make("schema", parquet::Repetition::REQUIRED, fields)); - } - } catch (const std::exception& ex) { - if (OB_SUCC(ret)) { - ret = OB_ERR_UNEXPECTED; - LOG_WARN("caught exception when setup parquet schema", K(ret), "Info", ex.what()); - LOG_USER_ERROR(OB_ERR_UNEXPECTED, ex.what()); - } - } catch (...) { - if (OB_SUCC(ret)) { - ret = OB_ERR_UNEXPECTED; - LOG_WARN("caught exception when setup parquet schema", K(ret)); - } - } - return ret; - } - - int ObSelectIntoOp::into_outfile_batch_parquet(const ObBatchRows &brs, ObExternalFileWriter *data_writer) - { - int ret = OB_SUCCESS; - const ObIArray &select_exprs = MY_SPEC.select_exprs_; - ObArray expr_vectors; - common::ObIVector* partition_vector; - int64_t estimated_bytes = 0; - int64_t row_group_size = 0; - int64_t file_size = 0; - ObParquetFileWriter *parquet_data_writer = NULL; - ObSQLMode sql_mode = eval_ctx_.exec_ctx_.get_my_session()->get_sql_mode(); - ObDateSqlMode date_sql_mode; - date_sql_mode.init(sql_mode); - bool is_strict_mode = common::is_strict_mode(sql_mode); - for (int64_t i = 0; OB_SUCC(ret) && i < select_exprs.count(); ++i) { - if (OB_ISNULL(select_exprs.at(i))) { - ret = OB_ERR_UNEXPECTED; - LOG_WARN("get unexpected null", K(ret)); - } else if (OB_FAIL(select_exprs.at(i)->eval_vector(eval_ctx_, brs))) { - LOG_WARN("failed to eval vector", K(ret)); - } else if (OB_FAIL(expr_vectors.push_back(select_exprs.at(i)->get_vector(eval_ctx_)))) { - LOG_WARN("failed to push back vector", K(ret)); - } - } - if (OB_SUCC(ret) && do_partition_) { - if (OB_FAIL(MY_SPEC.file_partition_expr_->eval_vector(eval_ctx_, brs))) { - LOG_WARN("failed to eval batch", K(ret)); - } else if (OB_ISNULL(partition_vector = MY_SPEC.file_partition_expr_->get_vector(eval_ctx_))) { - ret = OB_ERR_UNEXPECTED; - LOG_WARN("get unexpected null vector", K(ret)); - } - } - for (int64_t 
row_idx = 0; OB_SUCC(ret) && row_idx < brs.size_; ++row_idx) { - if (brs.skip_->contain(row_idx)) { - // do nothing - } else if (do_partition_ && OB_FAIL(get_data_writer_for_partition(partition_vector->get_string(row_idx), - data_writer))) { - LOG_WARN("failed to set data writer for partition", K(ret)); - } else if (OB_ISNULL(parquet_data_writer = static_cast(data_writer))) { - ret = OB_ERR_UNEXPECTED; - LOG_WARN("get unexpected null data writer", K(ret)); - } else if (parquet_data_writer->is_file_writer_null() - && OB_FAIL(parquet_data_writer->open_parquet_file_writer(arrow_alloc_, - external_properties_.parquet_format_.row_group_size_, - external_properties_.parquet_format_.compress_type_index_, - brs.size_, - ctx_.get_allocator()))) { - LOG_WARN("failed to init parquet file writer", K(ret)); - } else if (!parquet_data_writer->is_valid_to_write()) { - ret = OB_ERR_UNEXPECTED; - LOG_WARN("get unexpected null", K(ret)); - } else { - try { - for (int64_t col_idx = 0; OB_SUCC(ret) && col_idx < select_exprs.count(); col_idx++) { - if (OB_FAIL(build_parquet_cell(parquet_data_writer->get_row_group_writer(), - select_exprs.at(col_idx)->datum_meta_, - select_exprs.at(col_idx)->obj_meta_, - expr_vectors.at(col_idx), - col_idx, + escape_printer_.ignore_convert_failed_, + stop_when_truncated, + &truncated_len))) { + if (OB_ERR_DATA_TRUNCATED == ret && stop_when_truncated) { + lob_iter.set_reserved_byte_len(truncated_len); + ret = OB_SUCCESS; + } else { + LOG_WARN("failed to print lob", K(ret), K(src_block_data.length()), K(shared_buf_len_), + K(data_writer.get_curr_pos()), K(escape_printer_.buf_len_), K(escape_printer_.pos_)); + } + } + } + } + data_writer.set_curr_pos(escape_printer_.pos_); + } + if (OB_FAIL(ret)) { + } else if (state != TEXTSTRING_ITER_NEXT && state != TEXTSTRING_ITER_END) { + ret = (lob_iter.get_inner_ret() != OB_SUCCESS) ? 
+ lob_iter.get_inner_ret() : OB_INVALID_DATA; + LOG_WARN("iter state invalid", K(ret), K(state), K(lob_iter)); + } + return ret; +} + +int ObSelectIntoOp::write_single_char_to_file(const char *wchar, ObCsvFileWriter &data_writer) +{ + int ret = OB_SUCCESS; + char* buf = NULL; + int64_t buf_len = 0; + int64_t pos = 0; + OZ(get_buf(buf, buf_len, pos, data_writer)); + if (OB_SUCC(ret) && !use_shared_buf_) { + if (pos < buf_len) { + MEMCPY(buf + pos, wchar, 1); + data_writer.set_curr_pos(pos + 1); + } else if (OB_FAIL(data_writer.flush_buf())) { + LOG_WARN("failed to flush buffer", K(ret)); + } else if (OB_FALSE_IT(pos = data_writer.get_curr_pos())) { + } else if (pos < buf_len) { + MEMCPY(buf + pos, wchar, 1); + data_writer.set_curr_pos(pos + 1); + } else if (OB_FAIL(use_shared_buf(data_writer, buf, buf_len, pos))) { + LOG_WARN("failed to use shared buffer", K(ret)); + } + } + if (OB_SUCC(ret) && use_shared_buf_) { + if (pos < buf_len) { + MEMCPY(buf + pos, wchar, 1); + data_writer.set_curr_pos(pos + 1); + } else if (OB_FAIL(resize_or_flush_shared_buf(data_writer, buf, buf_len, pos))) { + LOG_WARN("failed to resize or flush shared buffer", K(ret)); + } else if (pos < buf_len) { + MEMCPY(buf + pos, wchar, 1); + data_writer.set_curr_pos(pos + 1); + } else { + ret = OB_ERR_UNEXPECTED; + LOG_WARN("get unexpected error", K(ret)); + } + } + return ret; +} + +int ObSelectIntoOp::print_lob_field(const ObObj &obj, + const ObExpr &expr, + const ObDatum &datum, + ObCsvFileWriter &data_writer) +{ + int ret = OB_SUCCESS; + if (has_enclose_) { + OZ(write_single_char_to_file(&char_enclose_, data_writer)); + } + OZ(write_lob_to_file(obj, expr, datum, data_writer)); + if (has_enclose_) { + OZ(write_single_char_to_file(&char_enclose_, data_writer)); + } + return ret; +} + +int ObSelectIntoOp::print_field(const ObObj &obj, ObCsvFileWriter &data_writer) +{ + int ret = OB_SUCCESS; + char char_n = 'N'; + const bool need_enclose = has_enclose_ && !obj.is_null() + && (!is_optional_ || 
obj.is_string_type() || obj.is_collection_sql_type() + || obj.is_json() || obj.is_geometry() || obj.is_date() + || obj.is_time() || obj.is_timestamp() || obj.is_datetime() + || obj.is_mysql_date() || obj.is_mysql_datetime()); + if (need_enclose) { + OZ(write_single_char_to_file(&char_enclose_, data_writer)); + } + if (!has_escape_) { + OZ(write_obj_to_file(obj, data_writer, false)); + } else if (obj.is_null()) { + OZ(write_single_char_to_file(&char_escape_, data_writer)); + OZ(write_single_char_to_file(&char_n, data_writer)); + } else { + OZ(write_obj_to_file(obj, data_writer, true)); + } + if (need_enclose) { + OZ(write_single_char_to_file(&char_enclose_, data_writer)); + } + return ret; +} + +int ObSelectIntoOp::into_outfile(ObExternalFileWriter *data_writer) +{ + int ret = OB_SUCCESS; + const ObIArray &select_exprs = MY_SPEC.select_exprs_; + ObDatum *datum = NULL; + ObObj obj; + ObDatum *partition_datum = NULL; + ObCsvFileWriter *csv_data_writer = NULL; + if (do_partition_) { + if (OB_FAIL(MY_SPEC.file_partition_expr_->eval(eval_ctx_, partition_datum))) { + LOG_WARN("eval expr failed", K(ret)); + } else if (OB_ISNULL(partition_datum)) { + ret = OB_ERR_UNEXPECTED; + LOG_WARN("get unexpected null", K(ret)); + } else if (OB_FAIL(get_data_writer_for_partition(partition_datum->get_string(), data_writer))) { + LOG_WARN("failed to set data writer for partition", K(ret)); + } + } + if (OB_SUCC(ret)) { + if (OB_ISNULL(csv_data_writer = static_cast(data_writer))) { + ret = OB_ERR_UNEXPECTED; + LOG_WARN("get unexpected null data writer", K(ret)); + } + } + for (int64_t i = 0; OB_SUCC(ret) && i < select_exprs.count(); ++i) { + if (OB_ISNULL(select_exprs.at(i))) { + ret = OB_ERR_UNEXPECTED; + LOG_WARN("select expr is unexpected null", K(ret)); + } else if (OB_FAIL(select_exprs.at(i)->eval(eval_ctx_, datum))) { + LOG_WARN("eval expr failed", K(ret)); + } else if (OB_ISNULL(datum)) { + ret = OB_ERR_UNEXPECTED; + LOG_WARN("datum is unexpected null", K(ret)); + } else if 
(OB_FAIL(datum->to_obj(obj, + select_exprs.at(i)->obj_meta_, + select_exprs.at(i)->obj_datum_map_))) { + LOG_WARN("failed to get obj from datum", K(ret)); + } else if (!ob_is_text_tc(select_exprs.at(i)->obj_meta_.get_type()) || obj.is_null()) { + OZ(print_field(obj, *csv_data_writer)); + } else { // text tc + OZ(print_lob_field(obj, *select_exprs.at(i), *datum, *csv_data_writer)); + } + // print field terminator + if (OB_SUCC(ret) && i != select_exprs.count() - 1) { + OZ(write_obj_to_file(field_str_, *csv_data_writer)); + } + } + // print line terminator + OZ(write_obj_to_file(line_str_, *csv_data_writer)); + // check if need split file + OZ(check_csv_file_size(*csv_data_writer)); + // clear shared buffer + OZ(csv_data_writer->flush_shared_buf(shared_buf_)); + if (has_compress_) { + OZ(csv_data_writer->flush_buf()); + } + return ret; +} + +static OB_INLINE int get_cast_ret(const bool is_strict_mode, int ret) +{ + if (OB_SUCCESS != ret && !is_strict_mode) { + ret = OB_SUCCESS; + } + return ret; +} + +#define CAST_FAIL(stmt) \ + (OB_UNLIKELY((OB_SUCCESS != (ret = get_cast_ret((is_strict_mode), (stmt)))))) + + +int ObSelectIntoOp::decimal_to_string(const ObDatum &datum, + const ObDatumMeta &datum_meta, + std::string &res, + ObIAllocator &allocator) +{ + int ret = OB_SUCCESS; + char *buf = NULL; + int64_t pos = 0; + if (OB_ISNULL(buf = static_cast(allocator.alloc(OB_CAST_TO_VARCHAR_MAX_LENGTH)))) { + ret = OB_ALLOCATE_MEMORY_FAILED; + LOG_WARN("failed to alloc memory", K(ret)); + } else if (OB_FAIL(wide::to_string(datum.get_decimal_int(), datum.get_int_bytes(), datum_meta.scale_, + buf, OB_CAST_TO_VARCHAR_MAX_LENGTH, pos))) { + LOG_WARN("failed to get string", K(ret)); + } else { + res.assign(buf, pos); + } + return ret; +} + + +int ObSelectIntoOp::into_outfile_batch_csv(const ObBatchRows &brs, ObExternalFileWriter *data_writer) +{ + int ret = OB_SUCCESS; + const ObIArray &select_exprs = MY_SPEC.select_exprs_; + ObArray datum_vectors; + ObDatum *datum = NULL; + ObObj 
obj; + ObDatumVector partition_datum_vector; + ObCsvFileWriter *csv_data_writer = NULL; + for (int64_t i = 0; OB_SUCC(ret) && i < select_exprs.count(); ++i) { + if (OB_FAIL(select_exprs.at(i)->eval_batch(eval_ctx_, *brs.skip_, brs.size_))) { + LOG_WARN("failed to eval batch", K(ret)); + } else if (OB_FAIL(datum_vectors.push_back(select_exprs.at(i)->locate_expr_datumvector(eval_ctx_)))) { + LOG_WARN("failed to push back datum vector", K(ret)); + } + } + + if (OB_SUCC(ret) && do_partition_) { + if (OB_FAIL(MY_SPEC.file_partition_expr_->eval_batch(eval_ctx_, *brs.skip_, brs.size_))) { + LOG_WARN("failed to eval batch", K(ret)); + } else { + partition_datum_vector = MY_SPEC.file_partition_expr_->locate_expr_datumvector(eval_ctx_); + } + } + for (int64_t i = 0; OB_SUCC(ret) && i < brs.size_; ++i) { + if (brs.skip_->contain(i)) { + // do nothing + } else if (do_partition_ && OB_ISNULL(partition_datum_vector.at(i))) { + ret = OB_ERR_UNEXPECTED; + LOG_WARN("get unexpected null", K(ret)); + } else if (do_partition_ && OB_FAIL(get_data_writer_for_partition(partition_datum_vector.at(i)->get_string(), + data_writer))) { + LOG_WARN("failed to set data writer for partition", K(ret)); + } else if (OB_ISNULL(csv_data_writer = static_cast(data_writer))) { + ret = OB_ERR_UNEXPECTED; + LOG_WARN("get unexpected null data writer", K(ret)); + } else if (has_compress_ && OB_ISNULL(csv_data_writer->get_compress_stream_writer()) + && OB_FAIL(csv_data_writer->init_compress_writer(ctx_.get_allocator(), + external_properties_.csv_format_.compression_algorithm_, + MY_SPEC.buffer_size_))) { + LOG_WARN("failed to init compress stream writer", K(ret)); + } else { + for (int64_t col_idx = 0; OB_SUCC(ret) && col_idx < select_exprs.count(); ++col_idx) { + if (OB_ISNULL(datum = datum_vectors.at(col_idx).at(i))) { + ret = OB_ERR_UNEXPECTED; + LOG_WARN("datum is unexpected null", K(ret)); + } else if (OB_FAIL(datum->to_obj(obj, + select_exprs.at(col_idx)->obj_meta_, + 
select_exprs.at(col_idx)->obj_datum_map_))) { + LOG_WARN("failed to get obj from datum", K(ret)); + } else if (!ob_is_text_tc(select_exprs.at(col_idx)->obj_meta_.get_type()) || obj.is_null()) { + OZ(print_field(obj, *csv_data_writer)); + } else { // text tc + OZ(print_lob_field(obj, *select_exprs.at(col_idx), *datum, *csv_data_writer)); + } + // print field terminator + if (OB_SUCC(ret) && col_idx != select_exprs.count() - 1) { + OZ(write_obj_to_file(field_str_, *csv_data_writer)); + } + } + // print line terminator + OZ(write_obj_to_file(line_str_, *csv_data_writer)); + // check if need split file + OZ(check_csv_file_size(*csv_data_writer)); + // clear shared buffer + OZ(csv_data_writer->flush_shared_buf(shared_buf_)); + if (has_compress_) { + OZ(csv_data_writer->flush_buf()); + } + } + } + return ret; +} + +int ObSelectIntoOp::get_data_from_expr_vector(const common::ObIVector* expr_vector, + int row_idx, + ObObjType type, + int64_t &value, + const bool is_strict_mode, + const ObDateSqlMode date_sql_mode) +{ + int ret = OB_SUCCESS; + int32_t date; + switch(type) { + case ObTinyIntType: + value = expr_vector->get_tinyint(row_idx); + break; + case ObSmallIntType: + value = expr_vector->get_smallint(row_idx); + break; + case ObMediumIntType: + value = expr_vector->get_mediumint(row_idx); + break; + case ObInt32Type: + value = expr_vector->get_int32(row_idx); + break; + case ObIntType: + value = expr_vector->get_int(row_idx); + break; + case ObYearType: + value = expr_vector->get_year(row_idx); + break; + case ObDateType: + value = expr_vector->get_date(row_idx); + break; + case ObMySQLDateType: + CAST_FAIL( + ObTimeConverter::mdate_to_date(expr_vector->get_mysql_date(row_idx), date, date_sql_mode)); + value = date; + break; + case ObMySQLDateTimeType: + CAST_FAIL(ObTimeConverter::mdatetime_to_datetime(expr_vector->get_mysql_datetime(row_idx), value, + date_sql_mode)); + break; + default: + ret = OB_OBJ_TYPE_ERROR; + } + return ret; +} + +bool 
ObSelectIntoOp::file_need_split(int64_t file_size) +{ + return (file_location_ == IntoFileLocation::SERVER_DISK + && !MY_SPEC.is_single_ && file_size > MY_SPEC.max_file_size_) + || (file_location_ != IntoFileLocation::SERVER_DISK + && ((!MY_SPEC.is_single_ && file_size > min(MY_SPEC.max_file_size_, MAX_OSS_FILE_SIZE)) + || (MY_SPEC.is_single_ && file_size > MAX_OSS_FILE_SIZE))); +} + +int ObSelectIntoOp::check_oracle_number(ObObjType obj_type, int16_t &precision, int8_t scale) +{ + int ret = OB_SUCCESS; + return ret; +} + +int ObSelectIntoOp::calc_byte_array(const common::ObIVector* expr_vector, + int row_idx, + const ObDatumMeta &datum_meta, + const ObObjMeta &obj_meta, + ObIAllocator &allocator, + char* &buf, + uint32_t &res_len) +{ + int ret = OB_SUCCESS; + ObString ob_str; + ObString res_str; + bool has_lob_header = obj_meta.has_lob_header(); + res_len = 0; + buf = nullptr; + int64_t buf_size = 0; + if (OB_FAIL(ObTextStringHelper::read_real_string_data(allocator, expr_vector, datum_meta, + has_lob_header, ob_str, row_idx))) { + LOG_WARN("failed to get string", K(ret)); + } else if (ob_str.length() == 0 || CS_TYPE_BINARY == datum_meta.cs_type_ + || CHARSET_UTF8MB4 == ObCharset::charset_type_by_coll(datum_meta.cs_type_)) { + if (OB_FAIL(ob_write_string(allocator, ob_str, res_str))) { + LOG_WARN("failed to write string", K(ret)); + } else { + res_len = static_cast(res_str.length()); + buf = const_cast(res_str.ptr()); + } + } else if (OB_FALSE_IT(buf_size = ob_str.length() * ObCharset::MAX_MB_LEN)) { + } else if (OB_ISNULL(buf = static_cast(allocator.alloc(buf_size)))) { + ret = OB_ALLOCATE_MEMORY_FAILED; + LOG_WARN("failed to alloc memory", K(ret), K(buf_size)); + } else if (OB_FAIL(ObCharset::charset_convert(datum_meta.cs_type_, ob_str.ptr(), + ob_str.length(), CS_TYPE_UTF8MB4_BIN, + buf, buf_size, res_len, false, false))) { + LOG_WARN("failed to convert charset", K(ret)); + } + return ret; +} + +#ifndef OB_BUILD_EMBED_MODE +int ObSelectIntoOp::init_parquet_env() 
+{ + int ret = OB_SUCCESS; + arrow_alloc_.init(MTL_ID()); + if (OB_FAIL(setup_parquet_schema())) { + LOG_WARN("failed to set up parquet schema", K(ret)); + } else if (OB_FAIL(init_env_common())) { + LOG_WARN("failed to init env common", K(ret)); + } + return ret; +} + +int ObSelectIntoOp::get_parquet_logical_type(std::shared_ptr &logical_type, + const ObObjType &obj_type, + const int32_t precision, + const int32_t scale) +{ + int ret = OB_SUCCESS; + if (ObTinyIntType == obj_type) { + logical_type = parquet::LogicalType::Int(8, true); + } else if (ObSmallIntType == obj_type) { + logical_type = parquet::LogicalType::Int(16, true); + } else if (ObMediumIntType == obj_type || ObInt32Type == obj_type) { + logical_type = parquet::LogicalType::Int(32, true); + } else if (ObIntType == obj_type) { + logical_type = parquet::LogicalType::Int(64, true); + } else if (ObUTinyIntType == obj_type) { + logical_type = parquet::LogicalType::Int(8, false); + } else if (ObUSmallIntType == obj_type) { + logical_type = parquet::LogicalType::Int(16, false); + } else if (ObUMediumIntType == obj_type || ObUInt32Type == obj_type) { + logical_type = parquet::LogicalType::Int(32, false); + } else if (ObUInt64Type == obj_type) { + logical_type = parquet::LogicalType::Int(64, false); + } else if (ob_is_float_tc(obj_type) || ob_is_double_tc(obj_type)) { // float, ufloat, double, udouble + logical_type = parquet::LogicalType::None(); + } else if (ob_is_number_or_decimal_int_tc(obj_type)) { + logical_type = parquet::LogicalType::Decimal(precision, scale); + } else if (ob_is_datetime_or_mysql_datetime(obj_type)) { + logical_type = parquet::LogicalType::Timestamp(false, parquet::LogicalType::TimeUnit::MICROS); + } else if (ObTimestampType == obj_type) { + logical_type = parquet::LogicalType::Timestamp(true, parquet::LogicalType::TimeUnit::MICROS); + } else if (ob_is_date_or_mysql_date(obj_type)) { + logical_type = parquet::LogicalType::Date(); + } else if (ob_is_time_tc(obj_type)) { + logical_type = 
parquet::LogicalType::Time(false, parquet::LogicalType::TimeUnit::MICROS); + } else if (ob_is_year_tc(obj_type)) { + logical_type = parquet::LogicalType::Int(8, false); + } else if (ob_is_string_type(obj_type) || ObNullType == obj_type) { + logical_type = parquet::LogicalType::String(); + } else if (ob_is_bit_tc(obj_type) /*uint64_t*/) { + logical_type = parquet::LogicalType::Int(64, false); + } else if (ob_is_enum_or_set_type(obj_type) /*uint64_t*/) { + logical_type = parquet::LogicalType::Enum(); + } else { + // TODO(bitao): support json/bson/uuid/map/list + ret = OB_NOT_SUPPORTED; + LOG_USER_ERROR(OB_NOT_SUPPORTED, "convert this ob type to parquet type"); + LOG_WARN("unsupported obj type", K(ret), K(obj_type)); + } + return ret; +} + +int ObSelectIntoOp::get_parquet_physical_type(parquet::Type::type &physical_type, + const ObObjType &obj_type) +{ + int ret = OB_SUCCESS; + if (ObTinyIntType == obj_type || ObSmallIntType == obj_type + || ObMediumIntType == obj_type || ObInt32Type == obj_type + || ObUTinyIntType == obj_type || ObUSmallIntType == obj_type + || ObUMediumIntType == obj_type || ObUInt32Type == obj_type + || ob_is_date_or_mysql_date(obj_type) || ob_is_year_tc(obj_type)) { + physical_type = parquet::Type::INT32; + } else if (ObIntType == obj_type || ObUInt64Type == obj_type + || ob_is_datetime_or_mysql_datetime_tc(obj_type) + || ob_is_time_tc(obj_type) || ob_is_bit_tc(obj_type)) { + physical_type = parquet::Type::INT64; + } else if (ob_is_float_tc(obj_type)) { // float, ufloat + physical_type = parquet::Type::FLOAT; + } else if (ob_is_double_tc(obj_type)) { // double, udouble + physical_type = parquet::Type::DOUBLE; + } else if (ob_is_number_or_decimal_int_tc(obj_type)) { + physical_type = parquet::Type::FIXED_LEN_BYTE_ARRAY; + } else if (ob_is_string_tc(obj_type) /*varchar,char,varbinary,binary*/ + || ob_is_text_tc(obj_type) /*TinyText,MediumText,Text,LongText,TinyBLOB,MediumBLOB,BLOB,LongBLOB*/ + || ob_is_enum_or_set_type(obj_type) + || ObNullType == 
obj_type) { + physical_type = parquet::Type::BYTE_ARRAY; + } else { + ret = OB_NOT_SUPPORTED; + LOG_USER_ERROR(OB_NOT_SUPPORTED, "convert this ob type to parquet type"); + LOG_WARN("unsupported obj type", K(ret), K(obj_type)); + } + return ret; +} + +int ObSelectIntoOp::calc_parquet_decimal_length(int precision) +{ + // Put in utils? + return std::ceil((1 + precision / std::log10(2)) / 8); +} + +int ObSelectIntoOp::setup_parquet_schema() +{ + int ret = OB_SUCCESS; + ObMallocHookAttrGuard guard(ObMemAttr(MTL_ID(), "IntoParquet")); + parquet::schema::NodeVector fields; + const ObIArray &select_exprs = MY_SPEC.select_exprs_; + std::shared_ptr logical_type; + parquet::Type::type physical_type; + parquet::schema::NodePtr node; + try { + for (int64_t i = 0; OB_SUCC(ret) && i < select_exprs.count(); ++i) { + ObDatumMeta meta = select_exprs.at(i)->datum_meta_; + ObObjType obj_type = meta.get_type(); + ObString alias_name = MY_SPEC.alias_names_.strs_.at(i); + std::string column_name(alias_name.ptr(), alias_name.length()); + int primitive_length = -1; + if (OB_FAIL(check_oracle_number(obj_type, + select_exprs.at(i)->datum_meta_.precision_, + select_exprs.at(i)->datum_meta_.scale_))) { + LOG_WARN("not support number type", K(ret)); + } else if (OB_FAIL(get_parquet_logical_type(logical_type, + obj_type, + select_exprs.at(i)->datum_meta_.precision_, + select_exprs.at(i)->datum_meta_.scale_))) { + LOG_WARN("failed to get related logical type", K(ret)); + } else if (OB_FAIL(get_parquet_physical_type(physical_type, obj_type))) { + LOG_WARN("failed to get related physical type", K(ret)); + } else if (ob_is_number_or_decimal_int_tc(obj_type) + && OB_FALSE_IT(primitive_length = calc_parquet_decimal_length( + select_exprs.at(i)->datum_meta_.precision_))) { + } else { + //todo@linyi repetition level + node = parquet::schema::PrimitiveNode::Make(column_name, parquet::Repetition::OPTIONAL, + logical_type, physical_type, primitive_length); + fields.push_back(node); + } + } + if 
(OB_SUCC(ret)) { + parquet_writer_schema_ = std::static_pointer_cast( + parquet::schema::GroupNode::Make("schema", parquet::Repetition::REQUIRED, fields)); + } + } catch (const std::exception& ex) { + if (OB_SUCC(ret)) { + ret = OB_ERR_UNEXPECTED; + LOG_WARN("caught exception when setup parquet schema", K(ret), "Info", ex.what()); + LOG_USER_ERROR(OB_ERR_UNEXPECTED, ex.what()); + } + } catch (...) { + if (OB_SUCC(ret)) { + ret = OB_ERR_UNEXPECTED; + LOG_WARN("caught exception when setup parquet schema", K(ret)); + } + } + return ret; +} + +int ObSelectIntoOp::into_outfile_batch_parquet(const ObBatchRows &brs, ObExternalFileWriter *data_writer) +{ + int ret = OB_SUCCESS; + const ObIArray &select_exprs = MY_SPEC.select_exprs_; + ObArray expr_vectors; + common::ObIVector* partition_vector; + int64_t estimated_bytes = 0; + int64_t row_group_size = 0; + int64_t file_size = 0; + ObParquetFileWriter *parquet_data_writer = NULL; + ObSQLMode sql_mode = eval_ctx_.exec_ctx_.get_my_session()->get_sql_mode(); + ObDateSqlMode date_sql_mode; + date_sql_mode.init(sql_mode); + bool is_strict_mode = common::is_strict_mode(sql_mode); + for (int64_t i = 0; OB_SUCC(ret) && i < select_exprs.count(); ++i) { + if (OB_ISNULL(select_exprs.at(i))) { + ret = OB_ERR_UNEXPECTED; + LOG_WARN("get unexpected null", K(ret)); + } else if (OB_FAIL(select_exprs.at(i)->eval_vector(eval_ctx_, brs))) { + LOG_WARN("failed to eval vector", K(ret)); + } else if (OB_FAIL(expr_vectors.push_back(select_exprs.at(i)->get_vector(eval_ctx_)))) { + LOG_WARN("failed to push back vector", K(ret)); + } + } + if (OB_SUCC(ret) && do_partition_) { + if (OB_FAIL(MY_SPEC.file_partition_expr_->eval_vector(eval_ctx_, brs))) { + LOG_WARN("failed to eval batch", K(ret)); + } else if (OB_ISNULL(partition_vector = MY_SPEC.file_partition_expr_->get_vector(eval_ctx_))) { + ret = OB_ERR_UNEXPECTED; + LOG_WARN("get unexpected null vector", K(ret)); + } + } + for (int64_t row_idx = 0; OB_SUCC(ret) && row_idx < brs.size_; ++row_idx) { 
+ if (brs.skip_->contain(row_idx)) { + // do nothing + } else if (do_partition_ && OB_FAIL(get_data_writer_for_partition(partition_vector->get_string(row_idx), + data_writer))) { + LOG_WARN("failed to set data writer for partition", K(ret)); + } else if (OB_ISNULL(parquet_data_writer = static_cast(data_writer))) { + ret = OB_ERR_UNEXPECTED; + LOG_WARN("get unexpected null data writer", K(ret)); + } else if (parquet_data_writer->is_file_writer_null() + && OB_FAIL(parquet_data_writer->open_parquet_file_writer(arrow_alloc_, + external_properties_.parquet_format_.row_group_size_, + external_properties_.parquet_format_.compress_type_index_, + brs.size_, + ctx_.get_allocator()))) { + LOG_WARN("failed to init parquet file writer", K(ret)); + } else if (!parquet_data_writer->is_valid_to_write()) { + ret = OB_ERR_UNEXPECTED; + LOG_WARN("get unexpected null", K(ret)); + } else { + try { + for (int64_t col_idx = 0; OB_SUCC(ret) && col_idx < select_exprs.count(); col_idx++) { + if (OB_FAIL(build_parquet_cell(parquet_data_writer->get_row_group_writer(), + select_exprs.at(col_idx)->datum_meta_, + select_exprs.at(col_idx)->obj_meta_, + expr_vectors.at(col_idx), + col_idx, + row_idx, + parquet_data_writer->get_row_batch_offset(), + parquet_data_writer->get_parquet_value_offsets().at(col_idx), + parquet_data_writer->get_parquet_row_def_levels().at(col_idx), + parquet_data_writer->get_batch_allocator(), + parquet_data_writer->get_parquet_row_batch().at(col_idx), + is_strict_mode, + date_sql_mode))) { + LOG_WARN("failed to build parquet cell", K(ret)); + } + } + parquet_data_writer->set_batch_written(false); + parquet_data_writer->increase_row_batch_offset(); + if (OB_FAIL(ret)) { + // discard unwritten data if an error occurs + parquet_data_writer->set_batch_written(true); + parquet_data_writer->reset_row_batch_offset(); + parquet_data_writer->reset_value_offsets(); + } else if (parquet_data_writer->reach_batch_end()) { + if (OB_FAIL(parquet_data_writer->write_file())) { + 
LOG_WARN("failed to write parquet row batch", K(ret)); + } else if (OB_FAIL(check_parquet_file_size(*parquet_data_writer))) { + LOG_WARN("failed to check parquet file size", K(ret)); + } + parquet_data_writer->set_batch_written(true); + parquet_data_writer->reset_row_batch_offset(); + parquet_data_writer->reset_value_offsets(); + } + } catch (const std::exception& ex) { + if (OB_SUCC(ret)) { + ret = OB_ERR_UNEXPECTED; + LOG_WARN("caught exception when write parquet file", K(ret), "Info", ex.what()); + LOG_USER_ERROR(OB_ERR_UNEXPECTED, ex.what()); + } + } catch (...) { + if (OB_SUCC(ret)) { + ret = OB_ERR_UNEXPECTED; + LOG_WARN("caught exception when write parquet file", K(ret)); + } + } + } + } + return ret; +} + +int ObSelectIntoOp::oracle_timestamp_to_int96(const common::ObIVector* expr_vector, + int64_t row_idx, + const ObDatumMeta &datum_meta, + parquet::Int96 &res) +{ + int ret = OB_SUCCESS; + int64_t out_usec = 0; + int32_t tmp_offset = 0; + ObOTimestampData oracle_timestamp; + uint32_t julian_date_value = (out_usec / 86400000000LL) + 2440588; + uint64_t nsec_time_value = oracle_timestamp.time_ctx_.tail_nsec_ + std::abs(out_usec % 86400000000LL) * 1000; + res.value[2] = julian_date_value; + res.value[1] = nsec_time_value >> 32; + res.value[0] = nsec_time_value & UINT32_MAX; + return ret; +} + +int ObSelectIntoOp::check_parquet_file_size(ObParquetFileWriter &data_writer) +{ + int ret = OB_SUCCESS; + int64_t row_group_size = data_writer.get_row_group_size(); + int64_t file_size = data_writer.get_file_size(); + if (file_need_split(file_size)) { + if (OB_FAIL(split_file(data_writer))) { + LOG_WARN("failed to split file", K(ret)); + } else { + data_writer.set_write_bytes(0); + } + } else if (row_group_size > external_properties_.parquet_format_.row_group_size_) { + data_writer.get_row_group_writer()->Close(); + data_writer.set_write_bytes(file_size); + data_writer.open_next_row_group_writer(); + } + return ret; +} + +int 
ObSelectIntoOp::build_parquet_cell(parquet::RowGroupWriter* rg_writer, + const ObDatumMeta &datum_meta, + const ObObjMeta &obj_meta, + const common::ObIVector* expr_vector, + int64_t col_idx, + int64_t row_idx, + int64_t row_offset, + int64_t &value_offset, + int16_t* definition_levels, + ObIAllocator &allocator, + void* value_batch, + const bool is_strict_mode, + const ObDateSqlMode date_sql_mode) +{ + int ret = OB_SUCCESS; + int16_t null_definition_level = 0; + int16_t normal_definition_level = 1; + std::shared_ptr p_node; + parquet::ColumnWriter *col_writer = nullptr; + if (OB_ISNULL(expr_vector) || !parquet_writer_schema_ || OB_ISNULL(rg_writer) + || OB_ISNULL(col_writer = rg_writer->column(col_idx)) + || OB_ISNULL(definition_levels) || OB_ISNULL(value_batch) + || !(p_node = std::static_pointer_cast(parquet_writer_schema_->field(col_idx)))) { + ret = OB_ERR_UNEXPECTED; + LOG_WARN("get null ptr", K(ret)); + } else if (p_node->is_group()) { + ret = OB_NOT_SUPPORTED; + LOG_USER_ERROR(OB_NOT_SUPPORTED, "group type in parquet"); + LOG_WARN("not support group type in parquet", K(ret)); + } else { + switch (p_node->physical_type()) { + case parquet::Type::BYTE_ARRAY: + { + parquet::ByteArray* value = reinterpret_cast(value_batch); + value += value_offset; + char *buf = nullptr; + uint32_t res_len = 0; + if (expr_vector->is_null(row_idx)) { + definition_levels[row_offset] = null_definition_level; + } else if (OB_FAIL(calc_byte_array(expr_vector, row_idx, - parquet_data_writer->get_row_batch_offset(), - parquet_data_writer->get_parquet_value_offsets().at(col_idx), - parquet_data_writer->get_parquet_row_def_levels().at(col_idx), - parquet_data_writer->get_batch_allocator(), - parquet_data_writer->get_parquet_row_batch().at(col_idx), - is_strict_mode, - date_sql_mode))) { - LOG_WARN("failed to build parquet cell", K(ret)); - } - } - parquet_data_writer->set_batch_written(false); - parquet_data_writer->increase_row_batch_offset(); - if (OB_FAIL(ret)) { - // discard 
unwritten data if an error occurs - parquet_data_writer->set_batch_written(true); - parquet_data_writer->reset_row_batch_offset(); - parquet_data_writer->reset_value_offsets(); - } else if (parquet_data_writer->reach_batch_end()) { - if (OB_FAIL(parquet_data_writer->write_file())) { - LOG_WARN("failed to write parquet row batch", K(ret)); - } else if (OB_FAIL(check_parquet_file_size(*parquet_data_writer))) { - LOG_WARN("failed to check parquet file size", K(ret)); - } - parquet_data_writer->set_batch_written(true); - parquet_data_writer->reset_row_batch_offset(); - parquet_data_writer->reset_value_offsets(); - } - } catch (const std::exception& ex) { - if (OB_SUCC(ret)) { - ret = OB_ERR_UNEXPECTED; - LOG_WARN("caught exception when write parquet file", K(ret), "Info", ex.what()); - LOG_USER_ERROR(OB_ERR_UNEXPECTED, ex.what()); - } - } catch (...) { - if (OB_SUCC(ret)) { - ret = OB_ERR_UNEXPECTED; - LOG_WARN("caught exception when write parquet file", K(ret)); - } - } - } - } - return ret; - } - - int ObSelectIntoOp::oracle_timestamp_to_int96(const common::ObIVector* expr_vector, - int64_t row_idx, - const ObDatumMeta &datum_meta, - parquet::Int96 &res) - { - int ret = OB_SUCCESS; - int64_t out_usec = 0; - int32_t tmp_offset = 0; - ObOTimestampData oracle_timestamp; - uint32_t julian_date_value = (out_usec / 86400000000LL) + 2440588; - uint64_t nsec_time_value = oracle_timestamp.time_ctx_.tail_nsec_ + std::abs(out_usec % 86400000000LL) * 1000; - res.value[2] = julian_date_value; - res.value[1] = nsec_time_value >> 32; - res.value[0] = nsec_time_value & UINT32_MAX; - return ret; - } - - int ObSelectIntoOp::check_parquet_file_size(ObParquetFileWriter &data_writer) - { - int ret = OB_SUCCESS; - int64_t row_group_size = data_writer.get_row_group_size(); - int64_t file_size = data_writer.get_file_size(); - if (file_need_split(file_size)) { - if (OB_FAIL(split_file(data_writer))) { - LOG_WARN("failed to split file", K(ret)); - } else { - data_writer.set_write_bytes(0); - 
} - } else if (row_group_size > external_properties_.parquet_format_.row_group_size_) { - data_writer.get_row_group_writer()->Close(); - data_writer.set_write_bytes(file_size); - data_writer.open_next_row_group_writer(); - } - return ret; - } - - int ObSelectIntoOp::build_parquet_cell(parquet::RowGroupWriter* rg_writer, - const ObDatumMeta &datum_meta, - const ObObjMeta &obj_meta, - const common::ObIVector* expr_vector, - int64_t col_idx, - int64_t row_idx, - int64_t row_offset, - int64_t &value_offset, - int16_t* definition_levels, - ObIAllocator &allocator, - void* value_batch, - const bool is_strict_mode, - const ObDateSqlMode date_sql_mode) - { - int ret = OB_SUCCESS; - int16_t null_definition_level = 0; - int16_t normal_definition_level = 1; - std::shared_ptr p_node; - parquet::ColumnWriter *col_writer = nullptr; - if (OB_ISNULL(expr_vector) || !parquet_writer_schema_ || OB_ISNULL(rg_writer) - || OB_ISNULL(col_writer = rg_writer->column(col_idx)) - || OB_ISNULL(definition_levels) || OB_ISNULL(value_batch) - || !(p_node = std::static_pointer_cast(parquet_writer_schema_->field(col_idx)))) { - ret = OB_ERR_UNEXPECTED; - LOG_WARN("get null ptr", K(ret)); - } else if (p_node->is_group()) { - ret = OB_NOT_SUPPORTED; - LOG_USER_ERROR(OB_NOT_SUPPORTED, "group type in parquet"); - LOG_WARN("not support group type in parquet", K(ret)); - } else { - switch (p_node->physical_type()) { - case parquet::Type::BYTE_ARRAY: - { - parquet::ByteArray* value = reinterpret_cast(value_batch); - value += value_offset; - char *buf = nullptr; - uint32_t res_len = 0; - if (expr_vector->is_null(row_idx)) { - definition_levels[row_offset] = null_definition_level; - } else if (OB_FAIL(calc_byte_array(expr_vector, - row_idx, - datum_meta, - obj_meta, - allocator, - buf, - res_len))) { - LOG_WARN("failed to calc parquet byte array", K(ret)); - } else { - value->ptr = reinterpret_cast(buf); - value->len = res_len; - value_offset++; - definition_levels[row_offset] = normal_definition_level; - 
} - break; - } - case parquet::Type::FIXED_LEN_BYTE_ARRAY: - { - parquet::FixedLenByteArray* value = reinterpret_cast(value_batch); - value += value_offset; - parquet::FixedLenByteArrayWriter *writer = static_cast(col_writer); - int parquet_decimal_length = writer->descr()->type_length(); - ObArrayWrap parquet_flba; - if (expr_vector->is_null(row_idx)) { - definition_levels[row_offset] = null_definition_level; - } else if (OB_FAIL(parquet_flba.allocate_array(allocator, parquet_decimal_length))) { - LOG_WARN("failed to allocate array", K(ret)); - } else if (OB_FAIL(calc_parquet_decimal_array(expr_vector, - row_idx, - datum_meta, - parquet_decimal_length, - parquet_flba.get_data()))) { - LOG_WARN("failed to calc parquet decimal", K(ret)); - } else { - value->ptr = parquet_flba.get_data(); - value_offset++; - definition_levels[row_offset] = normal_definition_level; - } - break; - } - case parquet::Type::DOUBLE: - { - double* value = reinterpret_cast(value_batch); - value += value_offset; - if (expr_vector->is_null(row_idx)) { - definition_levels[row_offset] = null_definition_level; - } else { - *value = expr_vector->get_double(row_idx); - value_offset++; - definition_levels[row_offset] = normal_definition_level; - } - break; - } - case parquet::Type::FLOAT: - { - float* value = reinterpret_cast(value_batch); - value += value_offset; - if (expr_vector->is_null(row_idx)) { - definition_levels[row_offset] = null_definition_level; - } else { - *value = expr_vector->get_float(row_idx); - value_offset++; - definition_levels[row_offset] = normal_definition_level; - } - break; - } - case parquet::Type::INT32: - { - int32_t* value = reinterpret_cast(value_batch); - value += value_offset; - if (expr_vector->is_null(row_idx)) { - definition_levels[row_offset] = null_definition_level; - } else if (ob_is_mysql_date_tc(datum_meta.type_)) { - ObMySQLDate mdate(expr_vector->get_int32(row_idx)); - if (CAST_FAIL(ObTimeConverter::mdate_to_date(mdate, *value, date_sql_mode))) { - 
LOG_WARN("mdate_to_date fail", K(ret)); - } else { - value_offset++; - definition_levels[row_offset] = normal_definition_level; - } - } else { - *value = expr_vector->get_int32(row_idx); - value_offset++; - definition_levels[row_offset] = normal_definition_level; - } - break; - } - case parquet::Type::INT64: - { - int64_t* value = reinterpret_cast(value_batch); - value += value_offset; - if (expr_vector->is_null(row_idx)) { - definition_levels[row_offset] = null_definition_level; - } else if (ob_is_mysql_datetime(datum_meta.type_)) { - ObMySQLDateTime mdatetime(expr_vector->get_int(row_idx)); - if (CAST_FAIL(ObTimeConverter::mdatetime_to_datetime(mdatetime, *value, date_sql_mode))) { - LOG_WARN("mdatetime_to_datetime fail", K(ret)); - } else { - value_offset++; - definition_levels[row_offset] = normal_definition_level; - } - } else { - *value = expr_vector->get_int(row_idx); - value_offset++; - definition_levels[row_offset] = normal_definition_level; - } - break; - } - case parquet::Type::INT96: - { - parquet::Int96* value = reinterpret_cast(value_batch); - value += value_offset; - if (expr_vector->is_null(row_idx)) { - definition_levels[row_offset] = null_definition_level; - } else if (OB_FAIL(oracle_timestamp_to_int96(expr_vector, row_idx, datum_meta, *value))) { - LOG_WARN("failed to convert timestamp to int96", K(ret)); - } else { - value_offset++; - definition_levels[row_offset] = normal_definition_level; - } - break; - } - default: - { - ret = OB_ERR_UNEXPECTED; - LOG_WARN("unexpected type", K(p_node->physical_type()), K(ret)); - } - } - } - return ret; - } - - int ObSelectIntoOp::calc_parquet_decimal_array(const common::ObIVector* expr_vector, - int row_idx, - const ObDatumMeta &datum_meta, - int parquet_decimal_length, - uint8_t* parquet_flba_ptr) - { - int ret = OB_SUCCESS; - const ObDecimalInt* ob_decimal; - const uint8_t* decimal_bytes; - ObDecimalIntBuilder tmp_dec_alloc; - ObDecimalInt* tmp_decimal; - int ob_decimal_length = 
wide::ObDecimalIntConstValue::get_int_bytes_by_precision(datum_meta.precision_); - if (ob_is_decimal_int_tc(datum_meta.get_type())) { - ob_decimal = expr_vector->get_decimal_int(row_idx); - } else if (ob_is_number_tc(datum_meta.get_type())) { - number::ObNumber number(expr_vector->get_number(row_idx)); - if (OB_FAIL(wide::from_number_to_decimal_fixed_length(number, tmp_dec_alloc, datum_meta.scale_, - ob_decimal_length, tmp_decimal))){ - LOG_WARN("failed to case number to decimal int", K(ret)); - } else { - ob_decimal = tmp_decimal; - } - } else { - ret = OB_ERR_UNEXPECTED; - LOG_WARN("get unexpected type", K(datum_meta.get_type())); - } - if (OB_FAIL(ret)) { - } else if (ob_decimal_length < parquet_decimal_length) { - ret = OB_ERR_UNEXPECTED; - LOG_WARN("get unexpected decimal length", K(ob_decimal_length), K(parquet_decimal_length), K(ret)); - } else { - switch (ob_decimal_length) { - case sizeof(int32_t): - { - decimal_bytes = reinterpret_cast(ob_decimal->int32_v_); - break; - } - case sizeof(int64_t): - { - decimal_bytes = reinterpret_cast(ob_decimal->int64_v_); - break; - } - case sizeof(int128_t): - { - decimal_bytes = reinterpret_cast(ob_decimal->int128_v_); - break; - } - case sizeof(int256_t): - { - decimal_bytes = reinterpret_cast(ob_decimal->int256_v_); - break; - } - case sizeof(int512_t): - { - decimal_bytes = reinterpret_cast(ob_decimal->int512_v_); - break; - } - default: - { - ret = OB_ERR_UNEXPECTED; - LOG_WARN("unexpected type", K(ob_decimal_length), K(ret)); - } - } - } - for (int i = 0; OB_SUCC(ret) && i < parquet_decimal_length; i++) { - parquet_flba_ptr[i] = decimal_bytes[parquet_decimal_length - i - 1]; - } - return ret; - } - #endif // !OB_BUILD_EMBED_MODE - - int ObSelectIntoOp::into_dumpfile(ObExternalFileWriter *data_writer) - { - int ret = OB_SUCCESS; - char buf[MAX_VALUE_LENGTH]; - int64_t buf_len = MAX_VALUE_LENGTH; - int64_t pos = 0; - if (OB_ISNULL(data_writer)) { - ret = OB_ERR_UNEXPECTED; - LOG_WARN("get unexpected null", K(ret)); - 
} else if (OB_FAIL(get_row_str(buf_len, is_first_, buf, pos))) { - LOG_WARN("get str failed", K(ret)); - } else if (is_first_) { // create file - if (OB_FAIL(data_writer->file_appender_.create(file_name_.get_varchar(), true))) { - LOG_WARN("create dumpfile failed", K(ret), K(file_name_)); - } else { - is_first_ = false; - } - } - if (OB_SUCC(ret)) { - if (OB_FAIL(data_writer->file_appender_.append(buf, pos, false))) { - LOG_WARN("failed to append file"); - } else { - //do nothing - } - } - return ret; - } - - int ObSelectIntoOp::into_varlist() - { - int ret = OB_SUCCESS; - //before 4_1 use output - //after 4_1 use select exprs - const ObIArray &select_exprs = (MY_SPEC.select_exprs_.empty()) ? - MY_SPEC.output_ : MY_SPEC.select_exprs_; - const ObIArray &user_vars = MY_SPEC.user_vars_; - ObArenaAllocator lob_tmp_allocator("LobTmp", OB_MALLOC_NORMAL_BLOCK_SIZE, MTL_ID()); - if (select_exprs.count() != user_vars.count()) { - ret = OB_ERR_COLUMN_SIZE; - LOG_WARN("user vars count should be equal to select exprs count" , K(ret), - K(select_exprs.count()), K(user_vars.count())); - } else { - for (int i = 0 ; i < user_vars.count(); ++i) { - const ObString &var_name = user_vars.at(i); - ObObj obj; - ObDatum *datum = NULL; - if (OB_FAIL(select_exprs.at(i)->eval(eval_ctx_, datum))) { - LOG_WARN("eval expr failed", K(ret)); - } else if (OB_FAIL(datum->to_obj(obj, select_exprs.at(i)->obj_meta_))) { - LOG_WARN("convert datum to obj failed", K(ret), KPC(select_exprs.at(i))); - } else if (obj.is_lob_storage() - // outrow lob can not be assigned to user var, so convert outrow to inrow lob - // user var has independent memory, so using temporary memory here is fine - && OB_FAIL(ObTextStringIter::convert_outrow_lob_to_inrow_templob(obj, obj, nullptr, &lob_tmp_allocator, true/*allow_persist_inrow*/))) { - LOG_WARN("convert outrow to inrow lob failed", K(ret), K(obj)); - } else if (OB_FAIL(ObVariableSetExecutor::set_user_variable(obj, var_name, - ctx_.get_my_session()))) { - 
LOG_WARN("set user variable failed", K(ret)); - } - } - } - return ret; - } - - int ObSelectIntoOp::extract_fisrt_wchar_from_varhcar(const ObObj &obj, int32_t &wchar) - { - int ret = OB_SUCCESS; - int32_t length = 0; - if (obj.is_varying_len_char_type()) { - ObString str = obj.get_varchar(); - if (str.length() > 0) { - ret = ObCharset::mb_wc(obj.get_collation_type(), str.ptr(), str.length(), length, wchar); - } - } - return ret; - } - - int ObSelectIntoOp::print_wchar_to_buf(char *buf, - const int64_t buf_len, - int64_t &pos, - int32_t wchar, - ObString &str, - ObCollationType coll_type) - { - int ret = OB_SUCCESS; - int result_len = 0; - if (OB_FAIL(ObCharset::wc_mb(coll_type, wchar, buf + pos, buf_len - pos, result_len))) { - LOG_WARN("failed to convert wc to mb"); - } else { - str = ObString(result_len, buf + pos); - pos += result_len; - } - return ret; - } - - int ObSelectIntoOp::prepare_escape_printer() - { - int ret = OB_SUCCESS; - int64_t pos = 0; - char *buf = NULL; - int64_t buf_len = 6 * ObCharset::MAX_MB_LEN; - // mb->wc - int32_t wchar_enclose = char_enclose_; - int32_t wchar_escape = char_escape_; - int32_t wchar_field = 0; - int32_t wchar_line = 0; - int32_t wchar_zero = '\0'; - int32_t wchar_replace = 0; - OZ(extract_fisrt_wchar_from_varhcar(field_str_, wchar_field)); - OZ(extract_fisrt_wchar_from_varhcar(line_str_, wchar_line)); - OZ(ObCharset::get_replace_character(cs_type_, wchar_replace)); - // wc->mb - if (OB_ISNULL(buf = static_cast(ctx_.get_allocator().alloc(buf_len)))) { - ret = OB_ALLOCATE_MEMORY_FAILED; - LOG_WARN("failed to allocate buffer", K(ret), K(buf_len)); - } - if (has_enclose_) { - OZ(print_wchar_to_buf(buf, buf_len, pos, wchar_enclose, escape_printer_.enclose_, cs_type_)); - } - if (has_escape_) { - OZ(print_wchar_to_buf(buf, buf_len, pos, wchar_escape, escape_printer_.escape_, cs_type_)); - } - OZ(print_wchar_to_buf(buf, buf_len, pos, wchar_zero, escape_printer_.zero_, cs_type_)); - OZ(print_wchar_to_buf(buf, buf_len, pos, 
wchar_field, escape_printer_.field_terminator_, cs_type_)); - OZ(print_wchar_to_buf(buf, buf_len, pos, wchar_line, escape_printer_.line_terminator_, cs_type_)); - OZ(print_wchar_to_buf(buf, buf_len, pos, wchar_replace, escape_printer_.convert_replacer_, cs_type_)); - escape_printer_.coll_type_ = cs_type_; - escape_printer_.ignore_convert_failed_ = true; // todo@linyi provide user-defined interface - return ret; - } - - int ObSelectIntoOp::check_has_lob_or_json() - { - int ret = OB_SUCCESS; - const ObIArray &select_exprs = MY_SPEC.select_exprs_; - for (int64_t i = 0; OB_SUCC(ret) && (!has_lob_ || !has_json_ || !has_coll_) && i < select_exprs.count(); ++i) { - if (OB_ISNULL(select_exprs.at(i))) { - ret = OB_ERR_UNEXPECTED; - LOG_WARN("select expr is unexpected null", K(ret)); - } else if (ob_is_text_tc(select_exprs.at(i)->obj_meta_.get_type())) { - has_lob_ = true; - } else if (ob_is_json_tc(select_exprs.at(i)->obj_meta_.get_type())) { - has_json_ = true; - } else if (ob_is_collection_sql_type(select_exprs.at(i)->obj_meta_.get_type())) { - has_coll_ = true; - } - } - return ret; - } - - int ObSelectIntoOp::create_shared_buffer_for_data_writer() - { - int ret = OB_SUCCESS; - shared_buf_len_ = has_lob_ ? 
(5 * SHARED_BUFFER_SIZE) : SHARED_BUFFER_SIZE; - if (OB_ISNULL(shared_buf_ = static_cast(ctx_.get_allocator().alloc(shared_buf_len_)))) { - ret = OB_ALLOCATE_MEMORY_FAILED; - LOG_WARN("failed to allocate buffer", K(ret), K(shared_buf_len_)); - } - if (OB_SUCC(ret) && (has_json_ || has_coll_) && has_escape_) { - json_buf_len_ = OB_MALLOC_MIDDLE_BLOCK_SIZE; - if (OB_ISNULL(json_buf_ = static_cast(ctx_.get_allocator().alloc(json_buf_len_)))) { - ret = OB_ALLOCATE_MEMORY_FAILED; - LOG_WARN("failed to allocate buffer", K(ret), K(json_buf_len_)); - } - } - return ret; - } - - int ObSelectIntoOp::check_secure_file_path(ObString file_name) - { - int ret = OB_SUCCESS; - ObString file_path = file_name.split_on(file_name.reverse_find('/')); - char full_path_buf[PATH_MAX+1]; - char *actual_path = nullptr; - ObSqlString sql_str; - ObString secure_file_priv; - int64_t tenant_id = MTL_ID(); - if (OB_FAIL(sql_str.append(file_path.empty() ? "." : file_path))) { - LOG_WARN("failed to append string", K(ret)); - #ifdef _WIN32 - } else if (OB_ISNULL(actual_path = _fullpath(full_path_buf, sql_str.ptr(), PATH_MAX))) { - #else - } else if (OB_ISNULL(actual_path = realpath(sql_str.ptr(), full_path_buf))) { - #endif - ret = OB_FILE_NOT_EXIST; - LOG_WARN("file not exist", K(ret), K(sql_str)); - } else if (OB_FAIL(ObSchemaUtils::get_tenant_varchar_variable(tenant_id, - SYS_VAR_SECURE_FILE_PRIV, - ctx_.get_allocator(), - secure_file_priv))) { - LOG_WARN("fail get tenant variable", K(tenant_id), K(secure_file_priv), K(ret)); - } else if (OB_FAIL(ObResolverUtils::check_secure_path(secure_file_priv, actual_path))) { - LOG_WARN("failed to check secure path", K(ret), K(secure_file_priv)); - if (OB_ERR_NO_PRIVILEGE == ret) { - ret = OB_ERR_NO_PRIV_DIRECT_PATH_ACCESS; - LOG_ERROR("failed to check secure path", K(ret), K(secure_file_priv)); - } - } - return ret; - } - - int ObSelectIntoOp::get_data_writer_for_partition(const ObString &partition_str, - ObExternalFileWriter *&data_writer) - { - int ret 
= OB_SUCCESS; - ObString partition; - ObExternalFileWriter *value = NULL; - ObCsvFileWriter *csv_data_writer = NULL; - if (OB_SUCC(partition_map_.get_refactored(partition_str, value))) { - if (OB_ISNULL(value)) { - ret = OB_ERR_UNEXPECTED; - LOG_WARN("get unexpected null", K(ret)); - } else { - data_writer = value; - } - } else if (OB_UNLIKELY(OB_HASH_NOT_EXIST != ret)) { - LOG_WARN("get unexpected error", K(ret)); - } else if (curr_partition_num_ >= OB_MAX_PARTITION_NUM_ORACLE) { - ret = OB_TOO_MANY_PARTITIONS_ERROR; - LOG_WARN("too many partitions", K(ret)); - } else { - ret = OB_SUCCESS; - bool writer_added = false; - if (OB_FAIL(new_data_writer(data_writer))) { - LOG_WARN("failed to new data writer", K(ret)); - } else if (OB_ISNULL(data_writer)) { - ret = OB_ERR_UNEXPECTED; - LOG_WARN("get unexpected null", K(ret)); - } else if (ObExternalFileFormat::FormatType::CSV_FORMAT == format_type_ && MY_SPEC.buffer_size_ > 0) { - csv_data_writer = static_cast(data_writer); - if (OB_FAIL(csv_data_writer->alloc_buf(ctx_.get_allocator(), MY_SPEC.buffer_size_))) { - LOG_WARN("failed to alloc buffer", K(ret)); - } - } - //add to hashmap - if (OB_FAIL(ret)) { - } else if (OB_FAIL(ob_write_string(ctx_.get_allocator(), - partition_str, - partition))) { - LOG_WARN("failed to write string", K(ret)); - } else if (OB_FAIL(partition_map_.set_refactored(partition, data_writer))) { - LOG_WARN("failed to add data writer to map", K(ret)); - } else { - curr_partition_num_++; - writer_added = true; - } - if (OB_FAIL(ret) && NULL != data_writer && !writer_added) { - data_writer->~ObExternalFileWriter(); - } - //calc file path - if (OB_SUCC(ret) && OB_FAIL(calc_file_path_with_partition(partition, *data_writer))) { - LOG_WARN("failed to calc file path with partition", K(ret)); - } - } - return ret; - } - - int ObSelectIntoOp::create_the_only_data_writer(ObExternalFileWriter *&data_writer) - { - int ret = OB_SUCCESS; - ObCsvFileWriter *csv_data_writer = NULL; - if 
(OB_FAIL(new_data_writer(data_writer))) { - LOG_WARN("failed to new data writer", K(ret)); - } else if (OB_ISNULL(data_writer)) { - ret = OB_ERR_UNEXPECTED; - LOG_WARN("get unexpected null", K(ret)); - } else { - data_writer->url_ = basic_url_; - data_writer_ = data_writer; - } - if (OB_FAIL(ret)) { - } else if (T_INTO_OUTFILE == MY_SPEC.into_type_ && MY_SPEC.is_single_ - && OB_FAIL(data_writer->open_file())) { - LOG_WARN("failed to open file", K(ret)); - } else if (ObExternalFileFormat::FormatType::CSV_FORMAT == format_type_ && MY_SPEC.buffer_size_ > 0) { - csv_data_writer = static_cast(data_writer); - if (OB_FAIL(csv_data_writer->alloc_buf(ctx_.get_allocator(), MY_SPEC.buffer_size_))) { - LOG_WARN("failed to alloc buffer", K(ret)); - } - } - return ret; - } - - int ObSelectIntoOp::new_data_writer(ObExternalFileWriter *&data_writer) - { - int ret = OB_SUCCESS; - void *ptr = NULL; - switch (format_type_) - { - case ObExternalFileFormat::FormatType::CSV_FORMAT: - { - if (OB_ISNULL(ptr = ctx_.get_allocator().alloc(sizeof(ObCsvFileWriter)))) { - ret = OB_ALLOCATE_MEMORY_FAILED; - LOG_WARN("failed to allocate data writer", K(ret), K(sizeof(ObCsvFileWriter))); - } else { - data_writer = new(ptr) ObCsvFileWriter(access_info_, file_location_, use_shared_buf_, - has_compress_, has_lob_, write_offset_); - } - break; - } - case ObExternalFileFormat::FormatType::PARQUET_FORMAT: - { - #ifdef OB_BUILD_EMBED_MODE - ret = OB_NOT_SUPPORTED; - LOG_WARN("parquet not supported in embed mode", K(ret)); - #else - if (lib::is_embed_mode()) { - ret = OB_NOT_SUPPORTED; - LOG_WARN("parquet not supported in embed mode", K(ret)); - } else if (OB_ISNULL(ptr = ctx_.get_allocator().alloc(sizeof(ObParquetFileWriter)))) { - ret = OB_ALLOCATE_MEMORY_FAILED; - LOG_WARN("failed to allocate data writer", K(ret), K(sizeof(ObParquetFileWriter))); - } else { - data_writer = new(ptr) ObParquetFileWriter(access_info_, file_location_, parquet_writer_schema_); - } - #endif - break; - } - case 
ObExternalFileFormat::FormatType::ORC_FORMAT: - { - ret = OB_NOT_SUPPORTED; - break; - } - default: - { - ret = OB_NOT_SUPPORTED; - LOG_WARN("not support select into type", K(format_type_)); - } - } - return ret; - } - - void ObSelectIntoOp::destroy() - { - ObExternalFileWriter *data_writer = NULL; - if (do_partition_) { - for (ObPartitionWriterMap::iterator iter = partition_map_.begin(); - iter != partition_map_.end(); iter++) { - if (OB_ISNULL(data_writer = iter->second)) { - } else { - data_writer->~ObExternalFileWriter(); - } - } - } else if (OB_NOT_NULL(data_writer_)) { - data_writer_->~ObExternalFileWriter(); - } - #ifndef OB_BUILD_EMBED_MODE - { - ObMallocHookAttrGuard guard(ObMemAttr(MTL_ID(), "IntoParquet")); - parquet_writer_schema_.reset(); - } - #endif - external_properties_.~ObExternalFileFormat(); - partition_map_.destroy(); - ObOperator::destroy(); - } - - #undef ARROW_FAIL - } - } - #undef CAST_FAIL - \ No newline at end of file + datum_meta, + obj_meta, + allocator, + buf, + res_len))) { + LOG_WARN("failed to calc parquet byte array", K(ret)); + } else { + value->ptr = reinterpret_cast(buf); + value->len = res_len; + value_offset++; + definition_levels[row_offset] = normal_definition_level; + } + break; + } + case parquet::Type::FIXED_LEN_BYTE_ARRAY: + { + parquet::FixedLenByteArray* value = reinterpret_cast(value_batch); + value += value_offset; + parquet::FixedLenByteArrayWriter *writer = static_cast(col_writer); + int parquet_decimal_length = writer->descr()->type_length(); + ObArrayWrap parquet_flba; + if (expr_vector->is_null(row_idx)) { + definition_levels[row_offset] = null_definition_level; + } else if (OB_FAIL(parquet_flba.allocate_array(allocator, parquet_decimal_length))) { + LOG_WARN("failed to allocate array", K(ret)); + } else if (OB_FAIL(calc_parquet_decimal_array(expr_vector, + row_idx, + datum_meta, + parquet_decimal_length, + parquet_flba.get_data()))) { + LOG_WARN("failed to calc parquet decimal", K(ret)); + } else { + value->ptr 
= parquet_flba.get_data(); + value_offset++; + definition_levels[row_offset] = normal_definition_level; + } + break; + } + case parquet::Type::DOUBLE: + { + double* value = reinterpret_cast(value_batch); + value += value_offset; + if (expr_vector->is_null(row_idx)) { + definition_levels[row_offset] = null_definition_level; + } else { + *value = expr_vector->get_double(row_idx); + value_offset++; + definition_levels[row_offset] = normal_definition_level; + } + break; + } + case parquet::Type::FLOAT: + { + float* value = reinterpret_cast(value_batch); + value += value_offset; + if (expr_vector->is_null(row_idx)) { + definition_levels[row_offset] = null_definition_level; + } else { + *value = expr_vector->get_float(row_idx); + value_offset++; + definition_levels[row_offset] = normal_definition_level; + } + break; + } + case parquet::Type::INT32: + { + int32_t* value = reinterpret_cast(value_batch); + value += value_offset; + if (expr_vector->is_null(row_idx)) { + definition_levels[row_offset] = null_definition_level; + } else if (ob_is_mysql_date_tc(datum_meta.type_)) { + ObMySQLDate mdate(expr_vector->get_int32(row_idx)); + if (CAST_FAIL(ObTimeConverter::mdate_to_date(mdate, *value, date_sql_mode))) { + LOG_WARN("mdate_to_date fail", K(ret)); + } else { + value_offset++; + definition_levels[row_offset] = normal_definition_level; + } + } else { + *value = expr_vector->get_int32(row_idx); + value_offset++; + definition_levels[row_offset] = normal_definition_level; + } + break; + } + case parquet::Type::INT64: + { + int64_t* value = reinterpret_cast(value_batch); + value += value_offset; + if (expr_vector->is_null(row_idx)) { + definition_levels[row_offset] = null_definition_level; + } else if (ob_is_mysql_datetime(datum_meta.type_)) { + ObMySQLDateTime mdatetime(expr_vector->get_int(row_idx)); + if (CAST_FAIL(ObTimeConverter::mdatetime_to_datetime(mdatetime, *value, date_sql_mode))) { + LOG_WARN("mdatetime_to_datetime fail", K(ret)); + } else { + value_offset++; + 
definition_levels[row_offset] = normal_definition_level; + } + } else { + *value = expr_vector->get_int(row_idx); + value_offset++; + definition_levels[row_offset] = normal_definition_level; + } + break; + } + case parquet::Type::INT96: + { + parquet::Int96* value = reinterpret_cast(value_batch); + value += value_offset; + if (expr_vector->is_null(row_idx)) { + definition_levels[row_offset] = null_definition_level; + } else if (OB_FAIL(oracle_timestamp_to_int96(expr_vector, row_idx, datum_meta, *value))) { + LOG_WARN("failed to convert timestamp to int96", K(ret)); + } else { + value_offset++; + definition_levels[row_offset] = normal_definition_level; + } + break; + } + default: + { + ret = OB_ERR_UNEXPECTED; + LOG_WARN("unexpected type", K(p_node->physical_type()), K(ret)); + } + } + } + return ret; +} + +int ObSelectIntoOp::calc_parquet_decimal_array(const common::ObIVector* expr_vector, + int row_idx, + const ObDatumMeta &datum_meta, + int parquet_decimal_length, + uint8_t* parquet_flba_ptr) +{ + int ret = OB_SUCCESS; + const ObDecimalInt* ob_decimal; + const uint8_t* decimal_bytes; + ObDecimalIntBuilder tmp_dec_alloc; + ObDecimalInt* tmp_decimal; + int ob_decimal_length = wide::ObDecimalIntConstValue::get_int_bytes_by_precision(datum_meta.precision_); + if (ob_is_decimal_int_tc(datum_meta.get_type())) { + ob_decimal = expr_vector->get_decimal_int(row_idx); + } else if (ob_is_number_tc(datum_meta.get_type())) { + number::ObNumber number(expr_vector->get_number(row_idx)); + if (OB_FAIL(wide::from_number_to_decimal_fixed_length(number, tmp_dec_alloc, datum_meta.scale_, + ob_decimal_length, tmp_decimal))){ + LOG_WARN("failed to case number to decimal int", K(ret)); + } else { + ob_decimal = tmp_decimal; + } + } else { + ret = OB_ERR_UNEXPECTED; + LOG_WARN("get unexpected type", K(datum_meta.get_type())); + } + if (OB_FAIL(ret)) { + } else if (ob_decimal_length < parquet_decimal_length) { + ret = OB_ERR_UNEXPECTED; + LOG_WARN("get unexpected decimal length", 
K(ob_decimal_length), K(parquet_decimal_length), K(ret)); + } else { + switch (ob_decimal_length) { + case sizeof(int32_t): + { + decimal_bytes = reinterpret_cast(ob_decimal->int32_v_); + break; + } + case sizeof(int64_t): + { + decimal_bytes = reinterpret_cast(ob_decimal->int64_v_); + break; + } + case sizeof(int128_t): + { + decimal_bytes = reinterpret_cast(ob_decimal->int128_v_); + break; + } + case sizeof(int256_t): + { + decimal_bytes = reinterpret_cast(ob_decimal->int256_v_); + break; + } + case sizeof(int512_t): + { + decimal_bytes = reinterpret_cast(ob_decimal->int512_v_); + break; + } + default: + { + ret = OB_ERR_UNEXPECTED; + LOG_WARN("unexpected type", K(ob_decimal_length), K(ret)); + } + } + } + for (int i = 0; OB_SUCC(ret) && i < parquet_decimal_length; i++) { + parquet_flba_ptr[i] = decimal_bytes[parquet_decimal_length - i - 1]; + } + return ret; +} +#endif // !OB_BUILD_EMBED_MODE + +int ObSelectIntoOp::into_dumpfile(ObExternalFileWriter *data_writer) +{ + int ret = OB_SUCCESS; + char buf[MAX_VALUE_LENGTH]; + int64_t buf_len = MAX_VALUE_LENGTH; + int64_t pos = 0; + if (OB_ISNULL(data_writer)) { + ret = OB_ERR_UNEXPECTED; + LOG_WARN("get unexpected null", K(ret)); + } else if (OB_FAIL(get_row_str(buf_len, is_first_, buf, pos))) { + LOG_WARN("get str failed", K(ret)); + } else if (is_first_) { // create file + if (OB_FAIL(data_writer->file_appender_.create(file_name_.get_varchar(), true))) { + LOG_WARN("create dumpfile failed", K(ret), K(file_name_)); + } else { + is_first_ = false; + } + } + if (OB_SUCC(ret)) { + if (OB_FAIL(data_writer->file_appender_.append(buf, pos, false))) { + LOG_WARN("failed to append file"); + } else { + //do nothing + } + } + return ret; +} + +int ObSelectIntoOp::into_varlist() +{ + int ret = OB_SUCCESS; + //before 4_1 use output + //after 4_1 use select exprs + const ObIArray &select_exprs = (MY_SPEC.select_exprs_.empty()) ? 
+ MY_SPEC.output_ : MY_SPEC.select_exprs_; + const ObIArray &user_vars = MY_SPEC.user_vars_; + ObArenaAllocator lob_tmp_allocator("LobTmp", OB_MALLOC_NORMAL_BLOCK_SIZE, MTL_ID()); + if (select_exprs.count() != user_vars.count()) { + ret = OB_ERR_COLUMN_SIZE; + LOG_WARN("user vars count should be equal to select exprs count" , K(ret), + K(select_exprs.count()), K(user_vars.count())); + } else { + for (int i = 0 ; i < user_vars.count(); ++i) { + const ObString &var_name = user_vars.at(i); + ObObj obj; + ObDatum *datum = NULL; + if (OB_FAIL(select_exprs.at(i)->eval(eval_ctx_, datum))) { + LOG_WARN("eval expr failed", K(ret)); + } else if (OB_FAIL(datum->to_obj(obj, select_exprs.at(i)->obj_meta_))) { + LOG_WARN("convert datum to obj failed", K(ret), KPC(select_exprs.at(i))); + } else if (obj.is_lob_storage() + // outrow lob can not be assigned to user var, so convert outrow to inrow lob + // user var has independent memory, so using temporary memory here is fine + && OB_FAIL(ObTextStringIter::convert_outrow_lob_to_inrow_templob(obj, obj, nullptr, &lob_tmp_allocator, true/*allow_persist_inrow*/))) { + LOG_WARN("convert outrow to inrow lob failed", K(ret), K(obj)); + } else if (OB_FAIL(ObVariableSetExecutor::set_user_variable(obj, var_name, + ctx_.get_my_session()))) { + LOG_WARN("set user variable failed", K(ret)); + } + } + } + return ret; +} + +int ObSelectIntoOp::extract_fisrt_wchar_from_varhcar(const ObObj &obj, int32_t &wchar) +{ + int ret = OB_SUCCESS; + int32_t length = 0; + if (obj.is_varying_len_char_type()) { + ObString str = obj.get_varchar(); + if (str.length() > 0) { + ret = ObCharset::mb_wc(obj.get_collation_type(), str.ptr(), str.length(), length, wchar); + } + } + return ret; +} + +int ObSelectIntoOp::print_wchar_to_buf(char *buf, + const int64_t buf_len, + int64_t &pos, + int32_t wchar, + ObString &str, + ObCollationType coll_type) +{ + int ret = OB_SUCCESS; + int result_len = 0; + if (OB_FAIL(ObCharset::wc_mb(coll_type, wchar, buf + pos, buf_len - pos, 
result_len))) { + LOG_WARN("failed to convert wc to mb"); + } else { + str = ObString(result_len, buf + pos); + pos += result_len; + } + return ret; +} + +int ObSelectIntoOp::prepare_escape_printer() +{ + int ret = OB_SUCCESS; + int64_t pos = 0; + char *buf = NULL; + int64_t buf_len = 6 * ObCharset::MAX_MB_LEN; + // mb->wc + int32_t wchar_enclose = char_enclose_; + int32_t wchar_escape = char_escape_; + int32_t wchar_field = 0; + int32_t wchar_line = 0; + int32_t wchar_zero = '\0'; + int32_t wchar_replace = 0; + OZ(extract_fisrt_wchar_from_varhcar(field_str_, wchar_field)); + OZ(extract_fisrt_wchar_from_varhcar(line_str_, wchar_line)); + OZ(ObCharset::get_replace_character(cs_type_, wchar_replace)); + // wc->mb + if (OB_ISNULL(buf = static_cast(ctx_.get_allocator().alloc(buf_len)))) { + ret = OB_ALLOCATE_MEMORY_FAILED; + LOG_WARN("failed to allocate buffer", K(ret), K(buf_len)); + } + if (has_enclose_) { + OZ(print_wchar_to_buf(buf, buf_len, pos, wchar_enclose, escape_printer_.enclose_, cs_type_)); + } + if (has_escape_) { + OZ(print_wchar_to_buf(buf, buf_len, pos, wchar_escape, escape_printer_.escape_, cs_type_)); + } + OZ(print_wchar_to_buf(buf, buf_len, pos, wchar_zero, escape_printer_.zero_, cs_type_)); + OZ(print_wchar_to_buf(buf, buf_len, pos, wchar_field, escape_printer_.field_terminator_, cs_type_)); + OZ(print_wchar_to_buf(buf, buf_len, pos, wchar_line, escape_printer_.line_terminator_, cs_type_)); + OZ(print_wchar_to_buf(buf, buf_len, pos, wchar_replace, escape_printer_.convert_replacer_, cs_type_)); + escape_printer_.coll_type_ = cs_type_; + escape_printer_.ignore_convert_failed_ = true; // todo@linyi provide user-defined interface + return ret; +} + +int ObSelectIntoOp::check_has_lob_or_json() +{ + int ret = OB_SUCCESS; + const ObIArray &select_exprs = MY_SPEC.select_exprs_; + for (int64_t i = 0; OB_SUCC(ret) && (!has_lob_ || !has_json_ || !has_coll_) && i < select_exprs.count(); ++i) { + if (OB_ISNULL(select_exprs.at(i))) { + ret = OB_ERR_UNEXPECTED; + 
LOG_WARN("select expr is unexpected null", K(ret)); + } else if (ob_is_text_tc(select_exprs.at(i)->obj_meta_.get_type())) { + has_lob_ = true; + } else if (ob_is_json_tc(select_exprs.at(i)->obj_meta_.get_type())) { + has_json_ = true; + } else if (ob_is_collection_sql_type(select_exprs.at(i)->obj_meta_.get_type())) { + has_coll_ = true; + } + } + return ret; +} + +int ObSelectIntoOp::create_shared_buffer_for_data_writer() +{ + int ret = OB_SUCCESS; + shared_buf_len_ = has_lob_ ? (5 * SHARED_BUFFER_SIZE) : SHARED_BUFFER_SIZE; + if (OB_ISNULL(shared_buf_ = static_cast(ctx_.get_allocator().alloc(shared_buf_len_)))) { + ret = OB_ALLOCATE_MEMORY_FAILED; + LOG_WARN("failed to allocate buffer", K(ret), K(shared_buf_len_)); + } + if (OB_SUCC(ret) && (has_json_ || has_coll_) && has_escape_) { + json_buf_len_ = OB_MALLOC_MIDDLE_BLOCK_SIZE; + if (OB_ISNULL(json_buf_ = static_cast(ctx_.get_allocator().alloc(json_buf_len_)))) { + ret = OB_ALLOCATE_MEMORY_FAILED; + LOG_WARN("failed to allocate buffer", K(ret), K(json_buf_len_)); + } + } + return ret; +} + +int ObSelectIntoOp::check_secure_file_path(ObString file_name) +{ + int ret = OB_SUCCESS; + ObString file_path = file_name.split_on(file_name.reverse_find('/')); + char full_path_buf[PATH_MAX+1]; + char *actual_path = nullptr; + ObSqlString sql_str; + ObString secure_file_priv; + int64_t tenant_id = MTL_ID(); + if (OB_FAIL(sql_str.append(file_path.empty() ? "." 
: file_path))) { + LOG_WARN("failed to append string", K(ret)); +#ifdef _WIN32 + } else if (OB_ISNULL(actual_path = _fullpath(full_path_buf, sql_str.ptr(), PATH_MAX))) { +#else + } else if (OB_ISNULL(actual_path = realpath(sql_str.ptr(), full_path_buf))) { +#endif + ret = OB_FILE_NOT_EXIST; + LOG_WARN("file not exist", K(ret), K(sql_str)); + } else if (OB_FAIL(ObSchemaUtils::get_tenant_varchar_variable(tenant_id, + SYS_VAR_SECURE_FILE_PRIV, + ctx_.get_allocator(), + secure_file_priv))) { + LOG_WARN("fail get tenant variable", K(tenant_id), K(secure_file_priv), K(ret)); + } else if (OB_FAIL(ObResolverUtils::check_secure_path(secure_file_priv, actual_path))) { + LOG_WARN("failed to check secure path", K(ret), K(secure_file_priv)); + if (OB_ERR_NO_PRIVILEGE == ret) { + ret = OB_ERR_NO_PRIV_DIRECT_PATH_ACCESS; + LOG_ERROR("failed to check secure path", K(ret), K(secure_file_priv)); + } + } + return ret; +} + +int ObSelectIntoOp::get_data_writer_for_partition(const ObString &partition_str, + ObExternalFileWriter *&data_writer) +{ + int ret = OB_SUCCESS; + ObString partition; + ObExternalFileWriter *value = NULL; + ObCsvFileWriter *csv_data_writer = NULL; + if (OB_SUCC(partition_map_.get_refactored(partition_str, value))) { + if (OB_ISNULL(value)) { + ret = OB_ERR_UNEXPECTED; + LOG_WARN("get unexpected null", K(ret)); + } else { + data_writer = value; + } + } else if (OB_UNLIKELY(OB_HASH_NOT_EXIST != ret)) { + LOG_WARN("get unexpected error", K(ret)); + } else if (curr_partition_num_ >= OB_MAX_PARTITION_NUM_ORACLE) { + ret = OB_TOO_MANY_PARTITIONS_ERROR; + LOG_WARN("too many partitions", K(ret)); + } else { + ret = OB_SUCCESS; + bool writer_added = false; + if (OB_FAIL(new_data_writer(data_writer))) { + LOG_WARN("failed to new data writer", K(ret)); + } else if (OB_ISNULL(data_writer)) { + ret = OB_ERR_UNEXPECTED; + LOG_WARN("get unexpected null", K(ret)); + } else if (ObExternalFileFormat::FormatType::CSV_FORMAT == format_type_ && MY_SPEC.buffer_size_ > 0) { + 
csv_data_writer = static_cast(data_writer); + if (OB_FAIL(csv_data_writer->alloc_buf(ctx_.get_allocator(), MY_SPEC.buffer_size_))) { + LOG_WARN("failed to alloc buffer", K(ret)); + } + } + //add to hashmap + if (OB_FAIL(ret)) { + } else if (OB_FAIL(ob_write_string(ctx_.get_allocator(), + partition_str, + partition))) { + LOG_WARN("failed to write string", K(ret)); + } else if (OB_FAIL(partition_map_.set_refactored(partition, data_writer))) { + LOG_WARN("failed to add data writer to map", K(ret)); + } else { + curr_partition_num_++; + writer_added = true; + } + if (OB_FAIL(ret) && NULL != data_writer && !writer_added) { + data_writer->~ObExternalFileWriter(); + } + //calc file path + if (OB_SUCC(ret) && OB_FAIL(calc_file_path_with_partition(partition, *data_writer))) { + LOG_WARN("failed to calc file path with partition", K(ret)); + } + } + return ret; +} + +int ObSelectIntoOp::create_the_only_data_writer(ObExternalFileWriter *&data_writer) +{ + int ret = OB_SUCCESS; + ObCsvFileWriter *csv_data_writer = NULL; + if (OB_FAIL(new_data_writer(data_writer))) { + LOG_WARN("failed to new data writer", K(ret)); + } else if (OB_ISNULL(data_writer)) { + ret = OB_ERR_UNEXPECTED; + LOG_WARN("get unexpected null", K(ret)); + } else { + data_writer->url_ = basic_url_; + data_writer_ = data_writer; + } + if (OB_FAIL(ret)) { + } else if (T_INTO_OUTFILE == MY_SPEC.into_type_ && MY_SPEC.is_single_ + && OB_FAIL(data_writer->open_file())) { + LOG_WARN("failed to open file", K(ret)); + } else if (ObExternalFileFormat::FormatType::CSV_FORMAT == format_type_ && MY_SPEC.buffer_size_ > 0) { + csv_data_writer = static_cast(data_writer); + if (OB_FAIL(csv_data_writer->alloc_buf(ctx_.get_allocator(), MY_SPEC.buffer_size_))) { + LOG_WARN("failed to alloc buffer", K(ret)); + } + } + return ret; +} + +int ObSelectIntoOp::new_data_writer(ObExternalFileWriter *&data_writer) +{ + int ret = OB_SUCCESS; + void *ptr = NULL; + switch (format_type_) + { + case 
ObExternalFileFormat::FormatType::CSV_FORMAT: + { + if (OB_ISNULL(ptr = ctx_.get_allocator().alloc(sizeof(ObCsvFileWriter)))) { + ret = OB_ALLOCATE_MEMORY_FAILED; + LOG_WARN("failed to allocate data writer", K(ret), K(sizeof(ObCsvFileWriter))); + } else { + data_writer = new(ptr) ObCsvFileWriter(access_info_, file_location_, use_shared_buf_, + has_compress_, has_lob_, write_offset_); + } + break; + } + case ObExternalFileFormat::FormatType::PARQUET_FORMAT: + { +#ifdef OB_BUILD_EMBED_MODE + ret = OB_NOT_SUPPORTED; + LOG_WARN("parquet not supported in embed mode", K(ret)); +#else + if (lib::is_embed_mode()) { + ret = OB_NOT_SUPPORTED; + LOG_WARN("parquet not supported in embed mode", K(ret)); + } else if (OB_ISNULL(ptr = ctx_.get_allocator().alloc(sizeof(ObParquetFileWriter)))) { + ret = OB_ALLOCATE_MEMORY_FAILED; + LOG_WARN("failed to allocate data writer", K(ret), K(sizeof(ObParquetFileWriter))); + } else { + data_writer = new(ptr) ObParquetFileWriter(access_info_, file_location_, parquet_writer_schema_); + } +#endif + break; + } + case ObExternalFileFormat::FormatType::ORC_FORMAT: + { + ret = OB_NOT_SUPPORTED; + break; + } + default: + { + ret = OB_NOT_SUPPORTED; + LOG_WARN("not support select into type", K(format_type_)); + } + } + return ret; +} + +void ObSelectIntoOp::destroy() +{ + ObExternalFileWriter *data_writer = NULL; + if (do_partition_) { + for (ObPartitionWriterMap::iterator iter = partition_map_.begin(); + iter != partition_map_.end(); iter++) { + if (OB_ISNULL(data_writer = iter->second)) { + } else { + data_writer->~ObExternalFileWriter(); + } + } + } else if (OB_NOT_NULL(data_writer_)) { + data_writer_->~ObExternalFileWriter(); + } +#ifndef OB_BUILD_EMBED_MODE + { + ObMallocHookAttrGuard guard(ObMemAttr(MTL_ID(), "IntoParquet")); + parquet_writer_schema_.reset(); + } +#endif + external_properties_.~ObExternalFileFormat(); + partition_map_.destroy(); + ObOperator::destroy(); +} + +#undef ARROW_FAIL +} +} +#undef CAST_FAIL diff --git 
a/src/sql/engine/table/ob_external_table_access_service.cpp b/src/sql/engine/table/ob_external_table_access_service.cpp index bb61fd774..485057111 100644 --- a/src/sql/engine/table/ob_external_table_access_service.cpp +++ b/src/sql/engine/table/ob_external_table_access_service.cpp @@ -1,871 +1,870 @@ /* - * Copyright (c) 2025 OceanBase. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - - #define USING_LOG_PREFIX SQL - - #include "ob_external_table_access_service.h" - #include "share/backup/ob_backup_io_adapter.h" - #include "share/external_table/ob_external_table_utils.h" - #include "share/ob_device_manager.h" - #ifndef OB_BUILD_EMBED_MODE - #include "sql/engine/table/ob_parquet_table_row_iter.h" - #include "sql/engine/table/ob_orc_table_row_iter.h" - #endif - #include "sql/engine/cmd/ob_load_data_file_reader.h" - #include "sql/engine/table/ob_csv_table_row_iter.h" - #include "sql/engine/expr/ob_expr_regexp_context.h" - #include "share/config/ob_server_config.h" - - namespace oceanbase - { - namespace common { - extern const char *OB_STORAGE_ACCESS_TYPES_STR[]; - } - - namespace share - { - struct ObExternalTablePartInfo; - class ObExternalTablePartInfoArray; - } - using namespace share::schema; - using namespace common; - using namespace share; - namespace sql - { - - static constexpr uint64_t OB_STORAGE_ID_EXTERNAL = 2001; - - ObExternalDataAccessDriver::~ObExternalDataAccessDriver() { - close(); - if (OB_NOT_NULL(device_handle_)) { - 
ObDeviceManager::get_instance().release_device(device_handle_); - } - } - - void ObExternalDataAccessDriver::close() - { - if (OB_NOT_NULL(device_handle_) && fd_.is_valid()) { - int ret = OB_SUCCESS; - if (OB_FAIL(ObBackupIoAdapter::close_device_and_fd(device_handle_, fd_))) { - LOG_WARN("fail to close device and fd", KR(ret), K_(fd), KP_(device_handle)); - } - } - } - - bool ObExternalDataAccessDriver::is_opened() const - { - return fd_.is_valid(); - } - - int ObExternalDataAccessDriver::get_file_size(const ObString &url, int64_t &file_size) - { - int ret = OB_SUCCESS; - file_size = -1; - CONSUMER_GROUP_FUNC_GUARD(ObFunctionType::PRIO_IMPORT); - ObString url_cstring; - ObArenaAllocator allocator; - - if (OB_FAIL(ob_write_string(allocator, url, url_cstring, true/*c_style*/))) { - LOG_WARN("fail to copy string", KR(ret), K(url)); - } else if (OB_FAIL(ObBackupIoAdapter::get_file_length(url_cstring, access_info_, file_size))) { - LOG_WARN("fail to get file length", KR(ret), K(url_cstring), K_(access_info)); - } - - if (OB_OBJECT_NOT_EXIST == ret || OB_IO_ERROR == ret) { - file_size = -1; - ret = OB_SUCCESS; - } - return ret; - } - - int ObExternalDataAccessDriver::open(const char *url) - { - int ret = OB_SUCCESS; - if (OB_UNLIKELY(is_opened())) { - ret = OB_INIT_TWICE; - LOG_WARN("Data Access Driver has been opened", KR(ret), K(url)); - } else if (OB_FAIL(ObBackupIoAdapter::open_with_access_type( - device_handle_, fd_, access_info_, url, OB_STORAGE_ACCESS_READER, - ObStorageIdMod(OB_STORAGE_ID_EXTERNAL, ObStorageUsedMod::STORAGE_USED_EXTERNAL)))) { - LOG_WARN("fail to open Data Access Driver", KR(ret), K_(access_info), K(url)); - } - return ret; - } - - int ObExternalDataAccessDriver::pread(void *buf, const int64_t count, const int64_t offset, int64_t &read_size) - { - int ret = OB_SUCCESS; - ObIOHandle io_handle; - CONSUMER_GROUP_FUNC_GUARD(PRIO_IMPORT); - if (OB_FAIL(ObBackupIoAdapter::async_pread(*device_handle_, fd_, - static_cast(buf), offset, count, io_handle))) 
{ - LOG_WARN("fail to async pread", KR(ret), - KP_(device_handle), K_(fd), KP(buf), K(offset), K(count)); - } else if (OB_FAIL(io_handle.wait())) { - LOG_WARN("fail to wait pread result", KR(ret), - KP_(device_handle), K_(fd), KP(buf), K(offset), K(count)); - } else { - read_size = io_handle.get_data_size(); - } - return ret; - } - - class ObExternalFileListArrayOpWithFilter : public ObBaseDirEntryOperator - { - public: - ObExternalFileListArrayOpWithFilter(ObIArray & name_array, - ObIArray & file_size, - ObExternalPathFilter *filter, - ObIAllocator& array_allocator) - : name_array_(name_array), file_size_(file_size), filter_(filter), allocator_(array_allocator) {} - - virtual bool need_get_file_size() const override { return true; } - int func(const dirent *entry) { - int ret = OB_SUCCESS; - if (OB_ISNULL(entry)) { - ret = OB_INVALID_ARGUMENT; - OB_LOG(WARN, "invalid list entry, entry is null"); - } else if (OB_ISNULL(entry->d_name)) { - ret = OB_INVALID_ARGUMENT; - OB_LOG(WARN, "invalid list entry, d_name is null"); - } else { - const ObString file_name(entry->d_name); - ObString tmp_file; - bool is_filtered = false; - if (!file_name.empty() && file_name[file_name.length() - 1] != '/') { - if (OB_NOT_NULL(filter_) && OB_FAIL(filter_->is_filtered(file_name, is_filtered))) { - LOG_WARN("fail check is filtered", K(ret)); - } else if (!is_filtered) { - if (OB_FAIL(ob_write_string(allocator_, file_name, tmp_file, true))) { - OB_LOG(WARN, "fail to save file name", K(ret), K(file_name)); - } else if (OB_FAIL(name_array_.push_back(tmp_file))) { - OB_LOG(WARN, "fail to push filename to array", K(ret), K(tmp_file)); - } else if (OB_FAIL(file_size_.push_back(get_size()))) { - OB_LOG(WARN, "fail to push size to array", K(ret), K(tmp_file)); - } - } - } - } - return ret; - } - - private: - ObIArray & name_array_; - ObIArray & file_size_; - ObExternalPathFilter *filter_; - ObIAllocator& allocator_; - }; - - class ObLocalFileListArrayOpWithFilter : public ObBaseDirEntryOperator 
- { - public: - ObLocalFileListArrayOpWithFilter(ObIArray &name_array, - ObIArray & file_size, - const ObString &path, - const ObString &origin_path, - ObExternalPathFilter *filter, - ObIAllocator &array_allocator) - : name_array_(name_array), file_size_(file_size), path_(path), origin_path_(origin_path), - filter_(filter), allocator_(array_allocator) {} - virtual bool need_get_file_size() const override { return true; } - int func(const dirent *entry) - { - int ret = OB_SUCCESS; - if (OB_ISNULL(entry)) { - ret = OB_INVALID_ARGUMENT; - OB_LOG(WARN, "invalid list entry, entry is null"); - } else if (OB_ISNULL(entry->d_name)) { - ret = OB_INVALID_ARGUMENT; - OB_LOG(WARN, "invalid list entry, d_name is null"); - } else { - const ObString file_name(entry->d_name); - ObSqlString full_path; - ObString tmp_file; - bool is_filtered = false; - ObString cur_path = path_; - if (file_name.case_compare(".") == 0 - || file_name.case_compare("..") == 0) { - //do nothing - } else if (OB_FAIL(full_path.assign(cur_path))) { - OB_LOG(WARN, "assign string failed", K(ret)); - } else if (full_path.length() > 0 && *(full_path.ptr() + full_path.length() - 1) != '/' && - OB_FAIL(full_path.append("/"))) { - OB_LOG(WARN, "append failed", K(ret)) ; - } else if (OB_FAIL(full_path.append(file_name))) { - OB_LOG(WARN, "append file name failed", K(ret)); - } else if (OB_NOT_NULL(filter_) && OB_FAIL(filter_->is_filtered(full_path.string(), is_filtered))) { - LOG_WARN("fail check is filtered", K(ret)); - } else if (!is_filtered) { - ObString target = full_path.string(); - if (!is_dir_scan()) { - target += origin_path_.length(); - if (!target.empty() && '/' == target[0]) { - target += 1; - } - } - if (target.empty()) { - ret = OB_ERR_UNEXPECTED; - LOG_WARN("empty dir or name", K(full_path), K(origin_path_)); - } else if (OB_FAIL(ob_write_string(allocator_, target, tmp_file, true/*c_style*/))) { - OB_LOG(WARN, "fail to save file name", K(ret), K(file_name)); - } else if 
(OB_FAIL(name_array_.push_back(tmp_file))) { - OB_LOG(WARN, "fail to push filename to array", K(ret), K(tmp_file)); - } else if (OB_FAIL(file_size_.push_back(get_size()))) { - OB_LOG(WARN, "fail to push size to array", K(ret), K(tmp_file)); - } - } - } - return ret; - } - private: - ObIArray &name_array_; - ObIArray &file_size_; - const ObString &path_; - const ObString &origin_path_; - ObExternalPathFilter *filter_; - ObIAllocator &allocator_; - }; - - - int ObExternalDataAccessDriver::get_file_list(const ObString &path, - const ObString &pattern, - const ObExprRegexpSessionVariables ®exp_vars, - ObIArray &file_urls, - ObIArray &file_sizes, - ObIAllocator &allocator) - { - int ret = OB_SUCCESS; - const int64_t MAX_VISIT_COUNT = 100000; - ObExprRegexContext regexp_ctx; - ObExternalPathFilter filter(regexp_ctx, allocator); - ObString path_cstring; - CONSUMER_GROUP_FUNC_GUARD(PRIO_IMPORT); - - if (OB_UNLIKELY(!access_info_->is_valid())) { - ret = OB_NOT_INIT; - LOG_WARN("ObExternalDataAccessDriver not init", KR(ret), K_(access_info)); - } else if (!pattern.empty() && OB_FAIL(filter.init(pattern, regexp_vars))) { - LOG_WARN("fail to init filter", K(ret)); - } else if (OB_FAIL(ob_write_string(allocator, path, path_cstring, true/*c_style*/))) { - LOG_WARN("fail to copy string", KR(ret), K(path)); - } else if (get_storage_type() == OB_STORAGE_FILE) { - ObSEArray file_dirs; - bool is_dir = false; - - if (get_storage_type() == OB_STORAGE_FILE) { - ObString path_without_prifix; - path_without_prifix = path_cstring; - path_without_prifix += strlen(OB_FILE_PREFIX); - - OZ(FileDirectoryUtils::is_directory(path_without_prifix.ptr(), is_dir)); - if (!is_dir) { - LOG_INFO("external location is not a directory", - K(path_without_prifix)); - } else { - OZ(file_dirs.push_back(path_cstring)); - } - } - - ObArray useless_size; - for (int64_t i = 0; OB_SUCC(ret) && i < file_dirs.count(); i++) { - ObString file_dir = file_dirs.at(i); - ObLocalFileListArrayOpWithFilter dir_op(file_dirs, 
useless_size, file_dir, path_cstring, NULL, allocator); - ObLocalFileListArrayOpWithFilter file_op(file_urls, file_sizes, file_dir, path_cstring, - pattern.empty() ? NULL : &filter, allocator); - dir_op.set_dir_flag(); - if (OB_FAIL(ObBackupIoAdapter::list_files(file_dir, access_info_, file_op))) { - LOG_WARN("fail to list files", KR(ret), K(file_dir), K_(access_info)); - } else if (OB_FAIL(ObBackupIoAdapter::list_directories(file_dir, access_info_, dir_op))) { - LOG_WARN("fail to list dirs", KR(ret), K(file_dir), K_(access_info)); - } else if (file_dirs.count() + file_urls.count() > MAX_VISIT_COUNT) { - ret = OB_SIZE_OVERFLOW; - LOG_WARN("too many files and dirs to visit", K(ret)); - } - } - } else { - ObExternalFileListArrayOpWithFilter file_op(file_urls, file_sizes, pattern.empty() ? NULL : &filter, allocator); - if (OB_FAIL(ObBackupIoAdapter::list_files(path_cstring, access_info_, file_op))) { - LOG_WARN("fail to list files", KR(ret), K(path_cstring), K_(access_info)); - } - } - return ret; - } - - int ObExternalDataAccessDriver::init(const ObString &location, const ObString &access_info) - { - int ret = OB_SUCCESS; - ObStorageType device_type = OB_STORAGE_MAX_TYPE; - ObArenaAllocator temp_allocator; - ObString location_cstr; - ObString access_info_cstr; - ObBackupIoAdapter util; - - if (OB_FAIL(get_storage_type_from_path(location, device_type))) { - LOG_WARN("fail to resove storage type", K(ret)); - } else { - storage_type_ = device_type; - // Note: if device type is file, the storage info is empty. 
- if (device_type == OB_STORAGE_FILE || - (OB_ISNULL(access_info) || OB_LIKELY(0 == access_info.length()))) { - OZ(ob_write_string(temp_allocator, location, location_cstr, true)); - access_info_cstr.assign_ptr(&dummy_empty_char, static_cast(strlen(&dummy_empty_char))); - } else { - OZ (ob_write_string(temp_allocator, location, location_cstr, true)); - OZ (ob_write_string(temp_allocator, access_info, access_info_cstr, true)); - } - } - access_info_ = &backup_storage_info_; - if (OB_ISNULL(access_info_)) { - ret = OB_INVALID_ARGUMENT; - LOG_WARN("failed to get access into", K(ret), K(device_type), K(access_info_cstr)); - } - LOG_TRACE("resolve storage into", K(ret), K(device_type), K(access_info_cstr)); - OZ (access_info_->set(device_type, access_info_cstr.ptr())); - - return ret; - } - - ObExternalStreamFileReader::~ObExternalStreamFileReader() - { - reset(); - } - - const char * ObExternalStreamFileReader::MEMORY_LABEL = "ExternalReader"; - const int64_t ObExternalStreamFileReader::COMPRESSED_DATA_BUFFER_SIZE = 2 * 1024 * 1024; - - int ObExternalStreamFileReader::init(const common::ObString &location, - const ObString &access_info, - ObCSVGeneralFormat::ObCSVCompression compression_format, - ObIAllocator &allocator) - { - int ret = OB_SUCCESS; - if (OB_NOT_NULL(allocator_)) { - ret = OB_INIT_TWICE; - } else if (OB_FAIL(data_access_driver_.init(location, access_info))) { - LOG_WARN("failed to init data access driver", K(ret), K(location), K(access_info)); - } else { - allocator_ = &allocator; - compression_format_ = compression_format; - } - return ret; - } - - int ObExternalStreamFileReader::open(const ObString &filename) - { - int ret = OB_SUCCESS; - if (OB_ISNULL(allocator_)) { - ret = OB_NOT_INIT; - } else if (data_access_driver_.is_opened()) { - ret = OB_INIT_TWICE; - } else if (OB_FAIL(data_access_driver_.open(filename.ptr()))) { - LOG_WARN("failed to open file", K(ret), K(filename)); - } else if (OB_FAIL(data_access_driver_.get_file_size(filename.ptr(), 
file_size_))) { - LOG_WARN("failed to get file size", K(ret), K(filename)); - } else { - is_file_end_ = false; - - ObCSVGeneralFormat::ObCSVCompression this_file_compression_format = compression_format_; - if (this_file_compression_format == ObCSVGeneralFormat::ObCSVCompression::AUTO - && OB_FAIL(compression_algorithm_from_suffix(filename, this_file_compression_format))) { - LOG_WARN("failed to dectect compression format from filename", K(ret), K(filename)); - } - - if (OB_SUCC(ret) && OB_FAIL(create_decompressor(this_file_compression_format))) { - LOG_WARN("failed to create decompressor", K(ret)); - } - } - - LOG_TRACE("open file done", K(filename), K(ret)); - return ret; - } - - void ObExternalStreamFileReader::close() - { - if (data_access_driver_.is_opened()) { - data_access_driver_.close(); - - is_file_end_ = true; - file_offset_ = 0; - file_size_ = 0; - LOG_DEBUG("close file"); - } - } - - void ObExternalStreamFileReader::reset() - { - close(); - if (OB_NOT_NULL(compressed_data_) && OB_NOT_NULL(allocator_)) { - allocator_->free(compressed_data_); - compressed_data_ = nullptr; - } - - if (OB_NOT_NULL(decompressor_)) { - ObDecompressor::destroy(decompressor_); - decompressor_ = nullptr; - } - - allocator_ = nullptr; - } - - bool ObExternalStreamFileReader::eof() - { - return is_file_end_; - } - - int ObExternalStreamFileReader::read(char *buf, int64_t buf_len, int64_t &read_size) - { - int ret = OB_SUCCESS; - read_size = 0; - - if (OB_ISNULL(buf) || buf_len <= 0) { - ret = OB_INVALID_ARGUMENT; - } else if (OB_ISNULL(decompressor_)) { - ret = read_from_driver(buf, buf_len, read_size); - is_file_end_ = file_offset_ >= file_size_; - LOG_DEBUG("read file", K(is_file_end_), K(file_offset_), K(file_size_), K(read_size)); - } else { - ret = read_decompress(buf, buf_len, read_size); - is_file_end_ = (file_offset_ >= file_size_) && (consumed_data_size_ >= compress_data_size_); - } - return ret; - } - - int ObExternalStreamFileReader::read_from_driver(char *buf, int64_t 
buf_len, int64_t &read_size) - { - int ret = OB_SUCCESS; - read_size = 0; - - if (OB_ISNULL(buf) || buf_len <= 0) { - ret = OB_INVALID_ARGUMENT; - } else if(OB_FAIL(data_access_driver_.pread(buf, buf_len, file_offset_, read_size))) { - LOG_WARN("failed to read data from data access driver", K(ret), K(file_offset_), K(buf_len)); - } else { - file_offset_ += read_size; - } - return ret; - } - - int ObExternalStreamFileReader::read_decompress(char *buf, int64_t buf_len, int64_t &read_size) - { - int ret = OB_SUCCESS; - read_size = 0; - - if (!data_access_driver_.is_opened()) { - ret = OB_NOT_INIT; - } else if (OB_ISNULL(buf) || buf_len <= 0) { - ret = OB_INVALID_ARGUMENT; - LOG_WARN("invalid argument", KP(buf), K(buf_len)); - } else if (consumed_data_size_ >= compress_data_size_) { - if (file_offset_ < file_size_) { - ret = read_compressed_data(); - } else { - is_file_end_ = true; - } - } - - if (OB_SUCC(ret) && compress_data_size_ > consumed_data_size_) { - int64_t consumed_size = 0; - ret = decompressor_->decompress(compressed_data_ + consumed_data_size_, - compress_data_size_ - consumed_data_size_, - consumed_size, - buf, - buf_len, - read_size); - if (OB_FAIL(ret)) { - LOG_WARN("failed to decompress", K(ret)); - } else { - consumed_data_size_ += consumed_size; - uncompressed_size_ += read_size; - } - } - return ret; - } - - int ObExternalStreamFileReader::read_compressed_data() - { - int ret = OB_SUCCESS; - char *read_buffer = compressed_data_; - if (!data_access_driver_.is_opened()) { - ret = OB_NOT_INIT; - } else if (OB_UNLIKELY(consumed_data_size_ < compress_data_size_)) { - // backup data - const int64_t last_data_size = compress_data_size_ - consumed_data_size_; - MEMMOVE(compressed_data_, compressed_data_ + consumed_data_size_, last_data_size); - read_buffer = compressed_data_ + last_data_size; - consumed_data_size_ = 0; - compress_data_size_ = last_data_size; - } else if (consumed_data_size_ == compress_data_size_) { - consumed_data_size_ = 0; - 
compress_data_size_ = 0; - } - - if (OB_SUCC(ret)) { - // read data from source reader - int64_t read_size = 0; - int64_t capacity = COMPRESSED_DATA_BUFFER_SIZE - compress_data_size_; - ret = read_from_driver(read_buffer, capacity, read_size); - if (OB_SUCC(ret)) { - compress_data_size_ += read_size; - } - } - return ret; - } - - int ObExternalStreamFileReader::create_decompressor(ObCSVGeneralFormat::ObCSVCompression compression_format) - { - int ret = OB_SUCCESS; - if (OB_ISNULL(allocator_)) { - ret = OB_NOT_INIT; - } else if (compression_format == ObCSVGeneralFormat::ObCSVCompression::NONE) { - ObDecompressor::destroy(decompressor_); - decompressor_ = nullptr; - } else if (OB_NOT_NULL(decompressor_) && decompressor_->compression_format() == compression_format) { - // do nothing - } else { - if (OB_NOT_NULL(decompressor_)) { - ObDecompressor::destroy(decompressor_); - decompressor_ = nullptr; - } - - if (OB_FAIL(ObDecompressor::create(compression_format, *allocator_, decompressor_))) { - LOG_WARN("failed to create decompressor", K(ret)); - } else if (OB_ISNULL(compressed_data_) && - OB_ISNULL(compressed_data_ = (char *)allocator_->alloc(COMPRESSED_DATA_BUFFER_SIZE))) { - ret = OB_ALLOCATE_MEMORY_FAILED; - LOG_WARN("failed to allocate memory", K(COMPRESSED_DATA_BUFFER_SIZE)); - } - } - return ret; - } - - int ObExternalTableAccessService::table_scan( - ObVTableScanParam ¶m, - ObNewRowIterator *&result) - { - ACTIVE_SESSION_FLAG_SETTER_GUARD(in_storage_read); - const share::ObLSID &ls_id = param.ls_id_; - common::ObASHTabletIdSetterGuard ash_tablet_id_guard(param.tablet_id_.id()); - ACTIVE_SESSION_RETRY_DIAG_INFO_SETTER(ls_id_, ls_id.id()); - ACTIVE_SESSION_RETRY_DIAG_INFO_SETTER(tablet_id_, param.tablet_id_.id()); - int ret = OB_SUCCESS; - ObExternalTableRowIterator* row_iter = NULL; - - auto &scan_param = static_cast(param); - - switch (param.external_file_format_.format_type_) { - case ObExternalFileFormat::CSV_FORMAT: - if (OB_ISNULL(row_iter = 
OB_NEWx(ObCSVTableRowIterator, (scan_param.allocator_)))) { - ret = OB_ALLOCATE_MEMORY_FAILED; - LOG_WARN("alloc memory failed", K(ret)); - } - break; - case ObExternalFileFormat::PARQUET_FORMAT: - #ifdef OB_BUILD_EMBED_MODE - ret = OB_NOT_SUPPORTED; - LOG_WARN("parquet not supported in embed mode", K(ret)); - #else - if (OB_ISNULL(row_iter = OB_NEWx(ObParquetTableRowIterator, (scan_param.allocator_)))) { - ret = OB_ALLOCATE_MEMORY_FAILED; - LOG_WARN("alloc memory failed", K(ret)); - } - #endif - break; - case ObExternalFileFormat::ODPS_FORMAT: - if (!GCONF._use_odps_jni_connector) { - ret = OB_NOT_SUPPORTED; - LOG_WARN("odps cpp connector is not enabled", K(ret)); - } else { - ret = OB_NOT_SUPPORTED; - LOG_WARN("odps jni connector is not enabled", K(ret)); - } - break; - case ObExternalFileFormat::ORC_FORMAT: - ret = OB_NOT_SUPPORTED; - break; - default: - ret = OB_ERR_UNEXPECTED; - LOG_WARN("unexpected format", K(ret), "format", param.external_file_format_.format_type_); - } - - if (OB_SUCC(ret)) { - if (OB_FAIL(row_iter->init(&scan_param))) { - row_iter->~ObExternalTableRowIterator(); - LOG_WARN("fail to open iter", K(ret)); - } else { - result = row_iter; - } - } - - LOG_DEBUG("external table access service iter init", K(ret), "type", param.external_file_format_.format_type_); - - return ret; - } - - int ObExternalTableAccessService::table_rescan(ObVTableScanParam ¶m, ObNewRowIterator *result) - { - ACTIVE_SESSION_FLAG_SETTER_GUARD(in_storage_read); - const share::ObLSID &ls_id = param.ls_id_; - common::ObASHTabletIdSetterGuard ash_tablet_id_guard(param.tablet_id_.id()); - ACTIVE_SESSION_RETRY_DIAG_INFO_SETTER(ls_id_, ls_id.id()); - ACTIVE_SESSION_RETRY_DIAG_INFO_SETTER(tablet_id_, param.tablet_id_.id()); - int ret = OB_SUCCESS; - if (OB_ISNULL(result)) { - ret = OB_ERR_UNEXPECTED; - LOG_WARN("unexpected iter", K(ret)); - } else { - switch (param.external_file_format_.format_type_) { - case ObExternalFileFormat::CSV_FORMAT: - case 
ObExternalFileFormat::PARQUET_FORMAT: - result->reset(); - break; - case ObExternalFileFormat::ORC_FORMAT: - ret = OB_NOT_SUPPORTED; - break; - case ObExternalFileFormat::ODPS_FORMAT: - ret = OB_NOT_SUPPORTED; - LOG_USER_ERROR(OB_NOT_SUPPORTED, "external odps table"); - LOG_WARN("not support to read odps in opensource", K(ret)); - break; - default: - ret = OB_ERR_UNEXPECTED; - LOG_WARN("unexpected format", K(ret), "format", param.external_file_format_.format_type_); - } - } - LOG_DEBUG("external table rescan", K(param.key_ranges_), K(param.range_array_pos_)); - return ret; - } - - int ObExternalTableAccessService::reuse_scan_iter(const bool switch_param, ObNewRowIterator *iter) - { - UNUSED(switch_param); - iter->reset(); - return OB_SUCCESS; - } - - int ObExternalTableAccessService::revert_scan_iter(ObNewRowIterator *iter) - { - int ret = OB_SUCCESS; - if (OB_ISNULL(iter)) { - ret = OB_ERR_UNEXPECTED; - LOG_WARN("unexpected iter", K(ret)); - } else { - iter->~ObNewRowIterator(); - } - return ret; - } - - int ObExternalTableRowIterator::init(const ObTableScanParam *scan_param) - { - scan_param_ = scan_param; - return init_exprs(scan_param); - } - - int ObExternalTableRowIterator::gen_ip_port(ObIAllocator &allocator) - { - int ret = OB_SUCCESS; - char buf[MAX_IP_PORT_SQL_LENGTH]; - int32_t len = 0; - OZ (GCONF.self_addr_.addr_to_buffer(buf, MAX_IP_PORT_SQL_LENGTH, len)); - OZ (ob_write_string(allocator, ObString(len, buf), ip_port_)); - return ret; - } - - int ObExternalTableRowIterator::init_exprs(const storage::ObTableScanParam *scan_param) - { - int ret = OB_SUCCESS; - if (OB_ISNULL(scan_param)) { - ret = OB_ERR_UNEXPECTED; - LOG_WARN("scan param is null", K(ret)); - } else { - if (scan_param->column_ids_.count() != scan_param->output_exprs_->count()) { - ret = OB_ERR_UNEXPECTED; - LOG_WARN("column ids not equal to access expr", K(ret)); - } - for (int i = 0; OB_SUCC(ret) && i < scan_param->column_ids_.count(); i++) { - ObExpr *cur_expr = 
scan_param->output_exprs_->at(i); - switch (scan_param->column_ids_.at(i)) { - case OB_HIDDEN_LINE_NUMBER_COLUMN_ID: - line_number_expr_ = cur_expr; - break; - case OB_HIDDEN_FILE_ID_COLUMN_ID: - file_id_expr_ = cur_expr; - break; - default: - OZ (column_exprs_.push_back(cur_expr)); - break; - } - } - if (OB_SUCC(ret) && column_exprs_.count() != scan_param->ext_column_convert_exprs_->count()) { - ret = OB_ERR_UNEXPECTED; - LOG_WARN("column expr not equal to convert convert expr", K(ret), - K(column_exprs_), KPC(scan_param->ext_column_convert_exprs_)); - } - } - return ret; - } - - int ObExternalTableRowIterator::fill_file_partition_expr(ObExpr *expr, ObNewRow &value, const int64_t row_count) - { - int ret = OB_SUCCESS; - ObEvalCtx &eval_ctx = scan_param_->op_->get_eval_ctx(); - ObDatum *datums = expr->locate_batch_datums(eval_ctx); - int64_t loc_idx = expr->extra_ - 1; - if (OB_UNLIKELY(loc_idx < 0 || loc_idx >= value.get_count())) { - ret = OB_ERR_UNEXPECTED; - LOG_WARN("loc idx is out of range", K(loc_idx), K(value), K(ret)); - } else { - if (value.get_cell(loc_idx).is_null()) { - for (int j = 0; OB_SUCC(ret) && j < row_count; j++) { - datums[j].set_null(); - } - } else { - for (int j = 0; OB_SUCC(ret) && j < row_count; j++) { - CK (OB_NOT_NULL(datums[j].ptr_)); - OZ (datums[j].from_obj(value.get_cell(loc_idx))); - } - } - } - return ret; - } - - int ObExternalTableRowIterator::calc_file_partition_list_value(const int64_t part_id, ObIAllocator &allocator, ObNewRow &value) - { - int ret = OB_SUCCESS; - share::schema::ObSchemaGetterGuard schema_guard; - const ObTableSchema *table_schema = NULL; - const ObPartition *partition = NULL; - ObExternalFileFormat::FormatType external_table_type; - bool is_odps_external_table = false; - if (OB_ISNULL(GCTX.schema_service_)) { - ret = OB_ERR_UNEXPECTED; - LOG_WARN("unexpected error"); - } else if (OB_FAIL(GCTX.schema_service_->get_tenant_schema_guard( - scan_param_->tenant_id_, - schema_guard))) { - LOG_WARN("get_schema_guard 
failed", K(ret)); - } else if (OB_FAIL(schema_guard.get_table_schema(scan_param_->tenant_id_, scan_param_->index_id_, table_schema))) { - LOG_WARN("get table schema failed", K(ret)); - } else if (OB_ISNULL(table_schema)) { - ret = OB_TABLE_NOT_EXIST; - LOG_WARN("table not exist", K(scan_param_->index_id_), K(scan_param_->tenant_id_)); - } else if (OB_FAIL(ObSQLUtils::is_odps_external_table(table_schema, is_odps_external_table))) { - LOG_WARN("failed to check is odps external table or not", K(ret)); - } else if (table_schema->is_partitioned_table() && (table_schema->is_user_specified_partition_for_external_table() || is_odps_external_table)) { - if (OB_FAIL(table_schema->get_partition_by_part_id(part_id, CHECK_PARTITION_MODE_NORMAL, partition))) { - LOG_WARN("get partition failed", K(ret), K(part_id)); - } else if (OB_ISNULL(partition) || OB_UNLIKELY(partition->get_list_row_values().count() != 1) - || partition->get_list_row_values().at(0).get_count() != table_schema->get_partition_key_column_num()) { - ret = OB_ERR_UNEXPECTED; - LOG_WARN("partition is invalid", K(ret), K(part_id)); - } else { - int64_t pos = 0; - int64_t size = partition->get_list_row_values().at(0).get_deep_copy_size(); - char *buf = (char *)allocator.alloc(size); - if (OB_ISNULL(buf)) { - ret = OB_ALLOCATE_MEMORY_FAILED; - LOG_WARN("allocate mem failed", K(ret)); - } - OZ (value.deep_copy(partition->get_list_row_values().at(0), buf, size, pos)); - } - } - return ret; - } - int ObExternalTableRowIterator::calc_file_part_list_value_by_array( - const int64_t part_id, ObIAllocator &allocator, - const share::ObExternalTablePartInfoArray *partition_array, ObNewRow &value) - { - int ret = OB_SUCCESS; - int64_t partition_index = OB_INVALID_INDEX; - share::ObExternalTablePartInfo partition; - - int64_t partition_num = partition_array->count(); - if (OB_ISNULL(partition_array) || partition_num <= 0) { - ret = OB_ERR_UNEXPECTED; - LOG_WARN("invalid partition array", K(ret), K(part_id), K(partition_num)); - 
} - - for (int64_t i = 0; OB_SUCC(ret) && i < partition_num; i++) { - if (part_id == partition_array->at(i).part_id_) { - partition_index = i; - break; - } - } - - if (OB_SUCC(ret) && partition_index != OB_INVALID_INDEX) { - partition = partition_array->at(partition_index); - } - - if (OB_SUCC(ret)) { - if (partition_index == OB_INVALID_INDEX || partition.part_id_ != part_id) { - ret = OB_ERR_UNEXPECTED; - LOG_WARN("invalid partition", K(ret), K(partition), K(part_id)); - } else { - int64_t pos = 0; - int64_t size = partition.list_row_value_.get_deep_copy_size(); - char *buf = (char *)allocator.alloc(size); - if (OB_ISNULL(buf)) { - ret = OB_ALLOCATE_MEMORY_FAILED; - LOG_WARN("allocate mem failed", K(ret)); - } - OZ (value.deep_copy(partition.list_row_value_, buf, size, pos)); - } - } - return ret; - } - - int ObExternalTableRowIterator::calc_exprs_for_rowid(const int64_t read_count, ObExternalIteratorState &state) - { - int ret = OB_SUCCESS; - ObEvalCtx &eval_ctx = scan_param_->op_->get_eval_ctx(); - if (OB_NOT_NULL(file_id_expr_)) { - OZ (file_id_expr_->init_vector_for_write(eval_ctx, VEC_FIXED, read_count)); - for (int i = 0; OB_SUCC(ret) && i < read_count; i++) { - ObFixedLengthBase *vec = static_cast(file_id_expr_->get_vector(eval_ctx)); - vec->set_int(i, state.cur_file_id_); - } - file_id_expr_->set_evaluated_flag(eval_ctx); - } - if (OB_NOT_NULL(line_number_expr_)) { - OZ (line_number_expr_->init_vector_for_write(eval_ctx, VEC_FIXED, read_count)); - for (int i = 0; OB_SUCC(ret) && i < read_count; i++) { - ObFixedLengthBase *vec = static_cast(line_number_expr_->get_vector(eval_ctx)); - vec->set_int(i, state.cur_line_number_ + i); - } - line_number_expr_->set_evaluated_flag(eval_ctx); - } - state.cur_line_number_ += read_count; - state.batch_first_row_line_num_ = state.cur_line_number_ - read_count; - return ret; - } - - DEF_TO_STRING(ObExternalIteratorState) - { - int64_t pos = 0; - J_OBJ_START(); - J_KV(K_(file_idx), - K_(part_id), - K_(cur_file_id), - 
K_(cur_line_number), - K_(cur_file_url), - K_(part_list_val)); - J_OBJ_END(); - return pos; - } - - - } - } - \ No newline at end of file +* Copyright (c) 2025 OceanBase. +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*/ + +#define USING_LOG_PREFIX SQL + +#include "ob_external_table_access_service.h" +#include "share/backup/ob_backup_io_adapter.h" +#include "share/external_table/ob_external_table_utils.h" +#include "share/ob_device_manager.h" +#ifndef OB_BUILD_EMBED_MODE +#include "sql/engine/table/ob_parquet_table_row_iter.h" +#include "sql/engine/table/ob_orc_table_row_iter.h" +#endif +#include "sql/engine/cmd/ob_load_data_file_reader.h" +#include "sql/engine/table/ob_csv_table_row_iter.h" +#include "sql/engine/expr/ob_expr_regexp_context.h" +#include "share/config/ob_server_config.h" + +namespace oceanbase +{ +namespace common { +extern const char *OB_STORAGE_ACCESS_TYPES_STR[]; +} + +namespace share +{ +struct ObExternalTablePartInfo; +class ObExternalTablePartInfoArray; +} +using namespace share::schema; +using namespace common; +using namespace share; +namespace sql +{ + +static constexpr uint64_t OB_STORAGE_ID_EXTERNAL = 2001; + +ObExternalDataAccessDriver::~ObExternalDataAccessDriver() { + close(); + if (OB_NOT_NULL(device_handle_)) { + ObDeviceManager::get_instance().release_device(device_handle_); + } +} + +void ObExternalDataAccessDriver::close() +{ + if (OB_NOT_NULL(device_handle_) && fd_.is_valid()) { + int ret = OB_SUCCESS; + if 
(OB_FAIL(ObBackupIoAdapter::close_device_and_fd(device_handle_, fd_))) { + LOG_WARN("fail to close device and fd", KR(ret), K_(fd), KP_(device_handle)); + } + } +} + +bool ObExternalDataAccessDriver::is_opened() const +{ + return fd_.is_valid(); +} + +int ObExternalDataAccessDriver::get_file_size(const ObString &url, int64_t &file_size) +{ + int ret = OB_SUCCESS; + file_size = -1; + CONSUMER_GROUP_FUNC_GUARD(ObFunctionType::PRIO_IMPORT); + ObString url_cstring; + ObArenaAllocator allocator; + + if (OB_FAIL(ob_write_string(allocator, url, url_cstring, true/*c_style*/))) { + LOG_WARN("fail to copy string", KR(ret), K(url)); + } else if (OB_FAIL(ObBackupIoAdapter::get_file_length(url_cstring, access_info_, file_size))) { + LOG_WARN("fail to get file length", KR(ret), K(url_cstring), K_(access_info)); + } + + if (OB_OBJECT_NOT_EXIST == ret || OB_IO_ERROR == ret) { + file_size = -1; + ret = OB_SUCCESS; + } + return ret; +} + +int ObExternalDataAccessDriver::open(const char *url) +{ + int ret = OB_SUCCESS; + if (OB_UNLIKELY(is_opened())) { + ret = OB_INIT_TWICE; + LOG_WARN("Data Access Driver has been opened", KR(ret), K(url)); + } else if (OB_FAIL(ObBackupIoAdapter::open_with_access_type( + device_handle_, fd_, access_info_, url, OB_STORAGE_ACCESS_READER, + ObStorageIdMod(OB_STORAGE_ID_EXTERNAL, ObStorageUsedMod::STORAGE_USED_EXTERNAL)))) { + LOG_WARN("fail to open Data Access Driver", KR(ret), K_(access_info), K(url)); + } + return ret; +} + +int ObExternalDataAccessDriver::pread(void *buf, const int64_t count, const int64_t offset, int64_t &read_size) +{ + int ret = OB_SUCCESS; + ObIOHandle io_handle; + CONSUMER_GROUP_FUNC_GUARD(PRIO_IMPORT); + if (OB_FAIL(ObBackupIoAdapter::async_pread(*device_handle_, fd_, + static_cast(buf), offset, count, io_handle))) { + LOG_WARN("fail to async pread", KR(ret), + KP_(device_handle), K_(fd), KP(buf), K(offset), K(count)); + } else if (OB_FAIL(io_handle.wait())) { + LOG_WARN("fail to wait pread result", KR(ret), + 
KP_(device_handle), K_(fd), KP(buf), K(offset), K(count)); + } else { + read_size = io_handle.get_data_size(); + } + return ret; +} + +class ObExternalFileListArrayOpWithFilter : public ObBaseDirEntryOperator +{ +public: + ObExternalFileListArrayOpWithFilter(ObIArray & name_array, + ObIArray & file_size, + ObExternalPathFilter *filter, + ObIAllocator& array_allocator) + : name_array_(name_array), file_size_(file_size), filter_(filter), allocator_(array_allocator) {} + + virtual bool need_get_file_size() const override { return true; } + int func(const dirent *entry) { + int ret = OB_SUCCESS; + if (OB_ISNULL(entry)) { + ret = OB_INVALID_ARGUMENT; + OB_LOG(WARN, "invalid list entry, entry is null"); + } else if (OB_ISNULL(entry->d_name)) { + ret = OB_INVALID_ARGUMENT; + OB_LOG(WARN, "invalid list entry, d_name is null"); + } else { + const ObString file_name(entry->d_name); + ObString tmp_file; + bool is_filtered = false; + if (!file_name.empty() && file_name[file_name.length() - 1] != '/') { + if (OB_NOT_NULL(filter_) && OB_FAIL(filter_->is_filtered(file_name, is_filtered))) { + LOG_WARN("fail check is filtered", K(ret)); + } else if (!is_filtered) { + if (OB_FAIL(ob_write_string(allocator_, file_name, tmp_file, true))) { + OB_LOG(WARN, "fail to save file name", K(ret), K(file_name)); + } else if (OB_FAIL(name_array_.push_back(tmp_file))) { + OB_LOG(WARN, "fail to push filename to array", K(ret), K(tmp_file)); + } else if (OB_FAIL(file_size_.push_back(get_size()))) { + OB_LOG(WARN, "fail to push size to array", K(ret), K(tmp_file)); + } + } + } + } + return ret; + } + +private: + ObIArray & name_array_; + ObIArray & file_size_; + ObExternalPathFilter *filter_; + ObIAllocator& allocator_; +}; + +class ObLocalFileListArrayOpWithFilter : public ObBaseDirEntryOperator +{ +public: + ObLocalFileListArrayOpWithFilter(ObIArray &name_array, + ObIArray & file_size, + const ObString &path, + const ObString &origin_path, + ObExternalPathFilter *filter, + ObIAllocator 
&array_allocator) + : name_array_(name_array), file_size_(file_size), path_(path), origin_path_(origin_path), + filter_(filter), allocator_(array_allocator) {} + virtual bool need_get_file_size() const override { return true; } + int func(const dirent *entry) + { + int ret = OB_SUCCESS; + if (OB_ISNULL(entry)) { + ret = OB_INVALID_ARGUMENT; + OB_LOG(WARN, "invalid list entry, entry is null"); + } else if (OB_ISNULL(entry->d_name)) { + ret = OB_INVALID_ARGUMENT; + OB_LOG(WARN, "invalid list entry, d_name is null"); + } else { + const ObString file_name(entry->d_name); + ObSqlString full_path; + ObString tmp_file; + bool is_filtered = false; + ObString cur_path = path_; + if (file_name.case_compare(".") == 0 + || file_name.case_compare("..") == 0) { + //do nothing + } else if (OB_FAIL(full_path.assign(cur_path))) { + OB_LOG(WARN, "assign string failed", K(ret)); + } else if (full_path.length() > 0 && *(full_path.ptr() + full_path.length() - 1) != '/' && + OB_FAIL(full_path.append("/"))) { + OB_LOG(WARN, "append failed", K(ret)) ; + } else if (OB_FAIL(full_path.append(file_name))) { + OB_LOG(WARN, "append file name failed", K(ret)); + } else if (OB_NOT_NULL(filter_) && OB_FAIL(filter_->is_filtered(full_path.string(), is_filtered))) { + LOG_WARN("fail check is filtered", K(ret)); + } else if (!is_filtered) { + ObString target = full_path.string(); + if (!is_dir_scan()) { + target += origin_path_.length(); + if (!target.empty() && '/' == target[0]) { + target += 1; + } + } + if (target.empty()) { + ret = OB_ERR_UNEXPECTED; + LOG_WARN("empty dir or name", K(full_path), K(origin_path_)); + } else if (OB_FAIL(ob_write_string(allocator_, target, tmp_file, true/*c_style*/))) { + OB_LOG(WARN, "fail to save file name", K(ret), K(file_name)); + } else if (OB_FAIL(name_array_.push_back(tmp_file))) { + OB_LOG(WARN, "fail to push filename to array", K(ret), K(tmp_file)); + } else if (OB_FAIL(file_size_.push_back(get_size()))) { + OB_LOG(WARN, "fail to push size to array", K(ret), 
K(tmp_file)); + } + } + } + return ret; + } +private: + ObIArray &name_array_; + ObIArray &file_size_; + const ObString &path_; + const ObString &origin_path_; + ObExternalPathFilter *filter_; + ObIAllocator &allocator_; +}; + + +int ObExternalDataAccessDriver::get_file_list(const ObString &path, + const ObString &pattern, + const ObExprRegexpSessionVariables ®exp_vars, + ObIArray &file_urls, + ObIArray &file_sizes, + ObIAllocator &allocator) +{ + int ret = OB_SUCCESS; + const int64_t MAX_VISIT_COUNT = 100000; + ObExprRegexContext regexp_ctx; + ObExternalPathFilter filter(regexp_ctx, allocator); + ObString path_cstring; + CONSUMER_GROUP_FUNC_GUARD(PRIO_IMPORT); + + if (OB_UNLIKELY(!access_info_->is_valid())) { + ret = OB_NOT_INIT; + LOG_WARN("ObExternalDataAccessDriver not init", KR(ret), K_(access_info)); + } else if (!pattern.empty() && OB_FAIL(filter.init(pattern, regexp_vars))) { + LOG_WARN("fail to init filter", K(ret)); + } else if (OB_FAIL(ob_write_string(allocator, path, path_cstring, true/*c_style*/))) { + LOG_WARN("fail to copy string", KR(ret), K(path)); + } else if (get_storage_type() == OB_STORAGE_FILE) { + ObSEArray file_dirs; + bool is_dir = false; + + if (get_storage_type() == OB_STORAGE_FILE) { + ObString path_without_prifix; + path_without_prifix = path_cstring; + path_without_prifix += strlen(OB_FILE_PREFIX); + + OZ(FileDirectoryUtils::is_directory(path_without_prifix.ptr(), is_dir)); + if (!is_dir) { + LOG_INFO("external location is not a directory", + K(path_without_prifix)); + } else { + OZ(file_dirs.push_back(path_cstring)); + } + } + + ObArray useless_size; + for (int64_t i = 0; OB_SUCC(ret) && i < file_dirs.count(); i++) { + ObString file_dir = file_dirs.at(i); + ObLocalFileListArrayOpWithFilter dir_op(file_dirs, useless_size, file_dir, path_cstring, NULL, allocator); + ObLocalFileListArrayOpWithFilter file_op(file_urls, file_sizes, file_dir, path_cstring, + pattern.empty() ? 
NULL : &filter, allocator); + dir_op.set_dir_flag(); + if (OB_FAIL(ObBackupIoAdapter::list_files(file_dir, access_info_, file_op))) { + LOG_WARN("fail to list files", KR(ret), K(file_dir), K_(access_info)); + } else if (OB_FAIL(ObBackupIoAdapter::list_directories(file_dir, access_info_, dir_op))) { + LOG_WARN("fail to list dirs", KR(ret), K(file_dir), K_(access_info)); + } else if (file_dirs.count() + file_urls.count() > MAX_VISIT_COUNT) { + ret = OB_SIZE_OVERFLOW; + LOG_WARN("too many files and dirs to visit", K(ret)); + } + } + } else { + ObExternalFileListArrayOpWithFilter file_op(file_urls, file_sizes, pattern.empty() ? NULL : &filter, allocator); + if (OB_FAIL(ObBackupIoAdapter::list_files(path_cstring, access_info_, file_op))) { + LOG_WARN("fail to list files", KR(ret), K(path_cstring), K_(access_info)); + } + } + return ret; +} + +int ObExternalDataAccessDriver::init(const ObString &location, const ObString &access_info) +{ + int ret = OB_SUCCESS; + ObStorageType device_type = OB_STORAGE_MAX_TYPE; + ObArenaAllocator temp_allocator; + ObString location_cstr; + ObString access_info_cstr; + ObBackupIoAdapter util; + + if (OB_FAIL(get_storage_type_from_path(location, device_type))) { + LOG_WARN("fail to resove storage type", K(ret)); + } else { + storage_type_ = device_type; + // Note: if device type is file, the storage info is empty. 
+ if (device_type == OB_STORAGE_FILE || + (OB_ISNULL(access_info) || OB_LIKELY(0 == access_info.length()))) { + OZ(ob_write_string(temp_allocator, location, location_cstr, true)); + access_info_cstr.assign_ptr(&dummy_empty_char, static_cast(strlen(&dummy_empty_char))); + } else { + OZ (ob_write_string(temp_allocator, location, location_cstr, true)); + OZ (ob_write_string(temp_allocator, access_info, access_info_cstr, true)); + } + } + access_info_ = &backup_storage_info_; + if (OB_ISNULL(access_info_)) { + ret = OB_INVALID_ARGUMENT; + LOG_WARN("failed to get access into", K(ret), K(device_type), K(access_info_cstr)); + } + LOG_TRACE("resolve storage into", K(ret), K(device_type), K(access_info_cstr)); + OZ (access_info_->set(device_type, access_info_cstr.ptr())); + + return ret; +} + +ObExternalStreamFileReader::~ObExternalStreamFileReader() +{ + reset(); +} + +const char * ObExternalStreamFileReader::MEMORY_LABEL = "ExternalReader"; +const int64_t ObExternalStreamFileReader::COMPRESSED_DATA_BUFFER_SIZE = 2 * 1024 * 1024; + +int ObExternalStreamFileReader::init(const common::ObString &location, + const ObString &access_info, + ObCSVGeneralFormat::ObCSVCompression compression_format, + ObIAllocator &allocator) +{ + int ret = OB_SUCCESS; + if (OB_NOT_NULL(allocator_)) { + ret = OB_INIT_TWICE; + } else if (OB_FAIL(data_access_driver_.init(location, access_info))) { + LOG_WARN("failed to init data access driver", K(ret), K(location), K(access_info)); + } else { + allocator_ = &allocator; + compression_format_ = compression_format; + } + return ret; +} + +int ObExternalStreamFileReader::open(const ObString &filename) +{ + int ret = OB_SUCCESS; + if (OB_ISNULL(allocator_)) { + ret = OB_NOT_INIT; + } else if (data_access_driver_.is_opened()) { + ret = OB_INIT_TWICE; + } else if (OB_FAIL(data_access_driver_.open(filename.ptr()))) { + LOG_WARN("failed to open file", K(ret), K(filename)); + } else if (OB_FAIL(data_access_driver_.get_file_size(filename.ptr(), file_size_))) { 
+ LOG_WARN("failed to get file size", K(ret), K(filename)); + } else { + is_file_end_ = false; + + ObCSVGeneralFormat::ObCSVCompression this_file_compression_format = compression_format_; + if (this_file_compression_format == ObCSVGeneralFormat::ObCSVCompression::AUTO + && OB_FAIL(compression_algorithm_from_suffix(filename, this_file_compression_format))) { + LOG_WARN("failed to dectect compression format from filename", K(ret), K(filename)); + } + + if (OB_SUCC(ret) && OB_FAIL(create_decompressor(this_file_compression_format))) { + LOG_WARN("failed to create decompressor", K(ret)); + } + } + + LOG_TRACE("open file done", K(filename), K(ret)); + return ret; +} + +void ObExternalStreamFileReader::close() +{ + if (data_access_driver_.is_opened()) { + data_access_driver_.close(); + + is_file_end_ = true; + file_offset_ = 0; + file_size_ = 0; + LOG_DEBUG("close file"); + } +} + +void ObExternalStreamFileReader::reset() +{ + close(); + if (OB_NOT_NULL(compressed_data_) && OB_NOT_NULL(allocator_)) { + allocator_->free(compressed_data_); + compressed_data_ = nullptr; + } + + if (OB_NOT_NULL(decompressor_)) { + ObDecompressor::destroy(decompressor_); + decompressor_ = nullptr; + } + + allocator_ = nullptr; +} + +bool ObExternalStreamFileReader::eof() +{ + return is_file_end_; +} + +int ObExternalStreamFileReader::read(char *buf, int64_t buf_len, int64_t &read_size) +{ + int ret = OB_SUCCESS; + read_size = 0; + + if (OB_ISNULL(buf) || buf_len <= 0) { + ret = OB_INVALID_ARGUMENT; + } else if (OB_ISNULL(decompressor_)) { + ret = read_from_driver(buf, buf_len, read_size); + is_file_end_ = file_offset_ >= file_size_; + LOG_DEBUG("read file", K(is_file_end_), K(file_offset_), K(file_size_), K(read_size)); + } else { + ret = read_decompress(buf, buf_len, read_size); + is_file_end_ = (file_offset_ >= file_size_) && (consumed_data_size_ >= compress_data_size_); + } + return ret; +} + +int ObExternalStreamFileReader::read_from_driver(char *buf, int64_t buf_len, int64_t &read_size) 
+{ + int ret = OB_SUCCESS; + read_size = 0; + + if (OB_ISNULL(buf) || buf_len <= 0) { + ret = OB_INVALID_ARGUMENT; + } else if(OB_FAIL(data_access_driver_.pread(buf, buf_len, file_offset_, read_size))) { + LOG_WARN("failed to read data from data access driver", K(ret), K(file_offset_), K(buf_len)); + } else { + file_offset_ += read_size; + } + return ret; +} + +int ObExternalStreamFileReader::read_decompress(char *buf, int64_t buf_len, int64_t &read_size) +{ + int ret = OB_SUCCESS; + read_size = 0; + + if (!data_access_driver_.is_opened()) { + ret = OB_NOT_INIT; + } else if (OB_ISNULL(buf) || buf_len <= 0) { + ret = OB_INVALID_ARGUMENT; + LOG_WARN("invalid argument", KP(buf), K(buf_len)); + } else if (consumed_data_size_ >= compress_data_size_) { + if (file_offset_ < file_size_) { + ret = read_compressed_data(); + } else { + is_file_end_ = true; + } + } + + if (OB_SUCC(ret) && compress_data_size_ > consumed_data_size_) { + int64_t consumed_size = 0; + ret = decompressor_->decompress(compressed_data_ + consumed_data_size_, + compress_data_size_ - consumed_data_size_, + consumed_size, + buf, + buf_len, + read_size); + if (OB_FAIL(ret)) { + LOG_WARN("failed to decompress", K(ret)); + } else { + consumed_data_size_ += consumed_size; + uncompressed_size_ += read_size; + } + } + return ret; +} + +int ObExternalStreamFileReader::read_compressed_data() +{ + int ret = OB_SUCCESS; + char *read_buffer = compressed_data_; + if (!data_access_driver_.is_opened()) { + ret = OB_NOT_INIT; + } else if (OB_UNLIKELY(consumed_data_size_ < compress_data_size_)) { + // backup data + const int64_t last_data_size = compress_data_size_ - consumed_data_size_; + MEMMOVE(compressed_data_, compressed_data_ + consumed_data_size_, last_data_size); + read_buffer = compressed_data_ + last_data_size; + consumed_data_size_ = 0; + compress_data_size_ = last_data_size; + } else if (consumed_data_size_ == compress_data_size_) { + consumed_data_size_ = 0; + compress_data_size_ = 0; + } + + if 
(OB_SUCC(ret)) { + // read data from source reader + int64_t read_size = 0; + int64_t capacity = COMPRESSED_DATA_BUFFER_SIZE - compress_data_size_; + ret = read_from_driver(read_buffer, capacity, read_size); + if (OB_SUCC(ret)) { + compress_data_size_ += read_size; + } + } + return ret; +} + +int ObExternalStreamFileReader::create_decompressor(ObCSVGeneralFormat::ObCSVCompression compression_format) +{ + int ret = OB_SUCCESS; + if (OB_ISNULL(allocator_)) { + ret = OB_NOT_INIT; + } else if (compression_format == ObCSVGeneralFormat::ObCSVCompression::NONE) { + ObDecompressor::destroy(decompressor_); + decompressor_ = nullptr; + } else if (OB_NOT_NULL(decompressor_) && decompressor_->compression_format() == compression_format) { + // do nothing + } else { + if (OB_NOT_NULL(decompressor_)) { + ObDecompressor::destroy(decompressor_); + decompressor_ = nullptr; + } + + if (OB_FAIL(ObDecompressor::create(compression_format, *allocator_, decompressor_))) { + LOG_WARN("failed to create decompressor", K(ret)); + } else if (OB_ISNULL(compressed_data_) && + OB_ISNULL(compressed_data_ = (char *)allocator_->alloc(COMPRESSED_DATA_BUFFER_SIZE))) { + ret = OB_ALLOCATE_MEMORY_FAILED; + LOG_WARN("failed to allocate memory", K(COMPRESSED_DATA_BUFFER_SIZE)); + } + } + return ret; +} + +int ObExternalTableAccessService::table_scan( + ObVTableScanParam ¶m, + ObNewRowIterator *&result) +{ + ACTIVE_SESSION_FLAG_SETTER_GUARD(in_storage_read); + const share::ObLSID &ls_id = param.ls_id_; + common::ObASHTabletIdSetterGuard ash_tablet_id_guard(param.tablet_id_.id()); + ACTIVE_SESSION_RETRY_DIAG_INFO_SETTER(ls_id_, ls_id.id()); + ACTIVE_SESSION_RETRY_DIAG_INFO_SETTER(tablet_id_, param.tablet_id_.id()); + int ret = OB_SUCCESS; + ObExternalTableRowIterator* row_iter = NULL; + + auto &scan_param = static_cast(param); + + switch (param.external_file_format_.format_type_) { + case ObExternalFileFormat::CSV_FORMAT: + if (OB_ISNULL(row_iter = OB_NEWx(ObCSVTableRowIterator, (scan_param.allocator_)))) { 
+ ret = OB_ALLOCATE_MEMORY_FAILED; + LOG_WARN("alloc memory failed", K(ret)); + } + break; + case ObExternalFileFormat::PARQUET_FORMAT: +#ifdef OB_BUILD_EMBED_MODE + ret = OB_NOT_SUPPORTED; + LOG_WARN("parquet not supported in embed mode", K(ret)); +#else + if (OB_ISNULL(row_iter = OB_NEWx(ObParquetTableRowIterator, (scan_param.allocator_)))) { + ret = OB_ALLOCATE_MEMORY_FAILED; + LOG_WARN("alloc memory failed", K(ret)); + } +#endif + break; + case ObExternalFileFormat::ODPS_FORMAT: + if (!GCONF._use_odps_jni_connector) { + ret = OB_NOT_SUPPORTED; + LOG_WARN("odps cpp connector is not enabled", K(ret)); + } else { + ret = OB_NOT_SUPPORTED; + LOG_WARN("odps jni connector is not enabled", K(ret)); + } + break; + case ObExternalFileFormat::ORC_FORMAT: + ret = OB_NOT_SUPPORTED; + break; + default: + ret = OB_ERR_UNEXPECTED; + LOG_WARN("unexpected format", K(ret), "format", param.external_file_format_.format_type_); + } + + if (OB_SUCC(ret)) { + if (OB_FAIL(row_iter->init(&scan_param))) { + row_iter->~ObExternalTableRowIterator(); + LOG_WARN("fail to open iter", K(ret)); + } else { + result = row_iter; + } + } + + LOG_DEBUG("external table access service iter init", K(ret), "type", param.external_file_format_.format_type_); + + return ret; +} + +int ObExternalTableAccessService::table_rescan(ObVTableScanParam ¶m, ObNewRowIterator *result) +{ + ACTIVE_SESSION_FLAG_SETTER_GUARD(in_storage_read); + const share::ObLSID &ls_id = param.ls_id_; + common::ObASHTabletIdSetterGuard ash_tablet_id_guard(param.tablet_id_.id()); + ACTIVE_SESSION_RETRY_DIAG_INFO_SETTER(ls_id_, ls_id.id()); + ACTIVE_SESSION_RETRY_DIAG_INFO_SETTER(tablet_id_, param.tablet_id_.id()); + int ret = OB_SUCCESS; + if (OB_ISNULL(result)) { + ret = OB_ERR_UNEXPECTED; + LOG_WARN("unexpected iter", K(ret)); + } else { + switch (param.external_file_format_.format_type_) { + case ObExternalFileFormat::CSV_FORMAT: + case ObExternalFileFormat::PARQUET_FORMAT: + result->reset(); + break; + case 
ObExternalFileFormat::ORC_FORMAT: + ret = OB_NOT_SUPPORTED; + break; + case ObExternalFileFormat::ODPS_FORMAT: + ret = OB_NOT_SUPPORTED; + LOG_USER_ERROR(OB_NOT_SUPPORTED, "external odps table"); + LOG_WARN("not support to read odps in opensource", K(ret)); + break; + default: + ret = OB_ERR_UNEXPECTED; + LOG_WARN("unexpected format", K(ret), "format", param.external_file_format_.format_type_); + } + } + LOG_DEBUG("external table rescan", K(param.key_ranges_), K(param.range_array_pos_)); + return ret; +} + +int ObExternalTableAccessService::reuse_scan_iter(const bool switch_param, ObNewRowIterator *iter) +{ + UNUSED(switch_param); + iter->reset(); + return OB_SUCCESS; +} + +int ObExternalTableAccessService::revert_scan_iter(ObNewRowIterator *iter) +{ + int ret = OB_SUCCESS; + if (OB_ISNULL(iter)) { + ret = OB_ERR_UNEXPECTED; + LOG_WARN("unexpected iter", K(ret)); + } else { + iter->~ObNewRowIterator(); + } + return ret; +} + +int ObExternalTableRowIterator::init(const ObTableScanParam *scan_param) +{ + scan_param_ = scan_param; + return init_exprs(scan_param); +} + +int ObExternalTableRowIterator::gen_ip_port(ObIAllocator &allocator) +{ + int ret = OB_SUCCESS; + char buf[MAX_IP_PORT_SQL_LENGTH]; + int32_t len = 0; + OZ (GCONF.self_addr_.addr_to_buffer(buf, MAX_IP_PORT_SQL_LENGTH, len)); + OZ (ob_write_string(allocator, ObString(len, buf), ip_port_)); + return ret; +} + +int ObExternalTableRowIterator::init_exprs(const storage::ObTableScanParam *scan_param) +{ + int ret = OB_SUCCESS; + if (OB_ISNULL(scan_param)) { + ret = OB_ERR_UNEXPECTED; + LOG_WARN("scan param is null", K(ret)); + } else { + if (scan_param->column_ids_.count() != scan_param->output_exprs_->count()) { + ret = OB_ERR_UNEXPECTED; + LOG_WARN("column ids not equal to access expr", K(ret)); + } + for (int i = 0; OB_SUCC(ret) && i < scan_param->column_ids_.count(); i++) { + ObExpr *cur_expr = scan_param->output_exprs_->at(i); + switch (scan_param->column_ids_.at(i)) { + case 
OB_HIDDEN_LINE_NUMBER_COLUMN_ID: + line_number_expr_ = cur_expr; + break; + case OB_HIDDEN_FILE_ID_COLUMN_ID: + file_id_expr_ = cur_expr; + break; + default: + OZ (column_exprs_.push_back(cur_expr)); + break; + } + } + if (OB_SUCC(ret) && column_exprs_.count() != scan_param->ext_column_convert_exprs_->count()) { + ret = OB_ERR_UNEXPECTED; + LOG_WARN("column expr not equal to convert convert expr", K(ret), + K(column_exprs_), KPC(scan_param->ext_column_convert_exprs_)); + } + } + return ret; +} + +int ObExternalTableRowIterator::fill_file_partition_expr(ObExpr *expr, ObNewRow &value, const int64_t row_count) +{ + int ret = OB_SUCCESS; + ObEvalCtx &eval_ctx = scan_param_->op_->get_eval_ctx(); + ObDatum *datums = expr->locate_batch_datums(eval_ctx); + int64_t loc_idx = expr->extra_ - 1; + if (OB_UNLIKELY(loc_idx < 0 || loc_idx >= value.get_count())) { + ret = OB_ERR_UNEXPECTED; + LOG_WARN("loc idx is out of range", K(loc_idx), K(value), K(ret)); + } else { + if (value.get_cell(loc_idx).is_null()) { + for (int j = 0; OB_SUCC(ret) && j < row_count; j++) { + datums[j].set_null(); + } + } else { + for (int j = 0; OB_SUCC(ret) && j < row_count; j++) { + CK (OB_NOT_NULL(datums[j].ptr_)); + OZ (datums[j].from_obj(value.get_cell(loc_idx))); + } + } + } + return ret; +} + +int ObExternalTableRowIterator::calc_file_partition_list_value(const int64_t part_id, ObIAllocator &allocator, ObNewRow &value) +{ + int ret = OB_SUCCESS; + share::schema::ObSchemaGetterGuard schema_guard; + const ObTableSchema *table_schema = NULL; + const ObPartition *partition = NULL; + ObExternalFileFormat::FormatType external_table_type; + bool is_odps_external_table = false; + if (OB_ISNULL(GCTX.schema_service_)) { + ret = OB_ERR_UNEXPECTED; + LOG_WARN("unexpected error"); + } else if (OB_FAIL(GCTX.schema_service_->get_tenant_schema_guard( + scan_param_->tenant_id_, + schema_guard))) { + LOG_WARN("get_schema_guard failed", K(ret)); + } else if 
(OB_FAIL(schema_guard.get_table_schema(scan_param_->tenant_id_, scan_param_->index_id_, table_schema))) { + LOG_WARN("get table schema failed", K(ret)); + } else if (OB_ISNULL(table_schema)) { + ret = OB_TABLE_NOT_EXIST; + LOG_WARN("table not exist", K(scan_param_->index_id_), K(scan_param_->tenant_id_)); + } else if (OB_FAIL(ObSQLUtils::is_odps_external_table(table_schema, is_odps_external_table))) { + LOG_WARN("failed to check is odps external table or not", K(ret)); + } else if (table_schema->is_partitioned_table() && (table_schema->is_user_specified_partition_for_external_table() || is_odps_external_table)) { + if (OB_FAIL(table_schema->get_partition_by_part_id(part_id, CHECK_PARTITION_MODE_NORMAL, partition))) { + LOG_WARN("get partition failed", K(ret), K(part_id)); + } else if (OB_ISNULL(partition) || OB_UNLIKELY(partition->get_list_row_values().count() != 1) + || partition->get_list_row_values().at(0).get_count() != table_schema->get_partition_key_column_num()) { + ret = OB_ERR_UNEXPECTED; + LOG_WARN("partition is invalid", K(ret), K(part_id)); + } else { + int64_t pos = 0; + int64_t size = partition->get_list_row_values().at(0).get_deep_copy_size(); + char *buf = (char *)allocator.alloc(size); + if (OB_ISNULL(buf)) { + ret = OB_ALLOCATE_MEMORY_FAILED; + LOG_WARN("allocate mem failed", K(ret)); + } + OZ (value.deep_copy(partition->get_list_row_values().at(0), buf, size, pos)); + } + } + return ret; +} +int ObExternalTableRowIterator::calc_file_part_list_value_by_array( + const int64_t part_id, ObIAllocator &allocator, + const share::ObExternalTablePartInfoArray *partition_array, ObNewRow &value) +{ + int ret = OB_SUCCESS; + int64_t partition_index = OB_INVALID_INDEX; + share::ObExternalTablePartInfo partition; + + int64_t partition_num = partition_array->count(); + if (OB_ISNULL(partition_array) || partition_num <= 0) { + ret = OB_ERR_UNEXPECTED; + LOG_WARN("invalid partition array", K(ret), K(part_id), K(partition_num)); + } + + for (int64_t i = 0; 
OB_SUCC(ret) && i < partition_num; i++) { + if (part_id == partition_array->at(i).part_id_) { + partition_index = i; + break; + } + } + + if (OB_SUCC(ret) && partition_index != OB_INVALID_INDEX) { + partition = partition_array->at(partition_index); + } + + if (OB_SUCC(ret)) { + if (partition_index == OB_INVALID_INDEX || partition.part_id_ != part_id) { + ret = OB_ERR_UNEXPECTED; + LOG_WARN("invalid partition", K(ret), K(partition), K(part_id)); + } else { + int64_t pos = 0; + int64_t size = partition.list_row_value_.get_deep_copy_size(); + char *buf = (char *)allocator.alloc(size); + if (OB_ISNULL(buf)) { + ret = OB_ALLOCATE_MEMORY_FAILED; + LOG_WARN("allocate mem failed", K(ret)); + } + OZ (value.deep_copy(partition.list_row_value_, buf, size, pos)); + } + } + return ret; +} + +int ObExternalTableRowIterator::calc_exprs_for_rowid(const int64_t read_count, ObExternalIteratorState &state) +{ + int ret = OB_SUCCESS; + ObEvalCtx &eval_ctx = scan_param_->op_->get_eval_ctx(); + if (OB_NOT_NULL(file_id_expr_)) { + OZ (file_id_expr_->init_vector_for_write(eval_ctx, VEC_FIXED, read_count)); + for (int i = 0; OB_SUCC(ret) && i < read_count; i++) { + ObFixedLengthBase *vec = static_cast(file_id_expr_->get_vector(eval_ctx)); + vec->set_int(i, state.cur_file_id_); + } + file_id_expr_->set_evaluated_flag(eval_ctx); + } + if (OB_NOT_NULL(line_number_expr_)) { + OZ (line_number_expr_->init_vector_for_write(eval_ctx, VEC_FIXED, read_count)); + for (int i = 0; OB_SUCC(ret) && i < read_count; i++) { + ObFixedLengthBase *vec = static_cast(line_number_expr_->get_vector(eval_ctx)); + vec->set_int(i, state.cur_line_number_ + i); + } + line_number_expr_->set_evaluated_flag(eval_ctx); + } + state.cur_line_number_ += read_count; + state.batch_first_row_line_num_ = state.cur_line_number_ - read_count; + return ret; +} + +DEF_TO_STRING(ObExternalIteratorState) +{ + int64_t pos = 0; + J_OBJ_START(); + J_KV(K_(file_idx), + K_(part_id), + K_(cur_file_id), + K_(cur_line_number), + 
K_(cur_file_url), + K_(part_list_val)); + J_OBJ_END(); + return pos; +} + + +} +} From a92f7b75ea55c58187b331c5a7619b1b8fe50bc7 Mon Sep 17 00:00:00 2001 From: hnwyllmm Date: Wed, 20 May 2026 11:25:31 +0800 Subject: [PATCH 3/6] fixed tab --- src/sql/engine/basic/ob_select_into_op.cpp | 30 +++++++++---------- .../ob_external_table_access_service.cpp | 30 +++++++++---------- 2 files changed, 30 insertions(+), 30 deletions(-) diff --git a/src/sql/engine/basic/ob_select_into_op.cpp b/src/sql/engine/basic/ob_select_into_op.cpp index 20295c9ee..5c03824ea 100644 --- a/src/sql/engine/basic/ob_select_into_op.cpp +++ b/src/sql/engine/basic/ob_select_into_op.cpp @@ -1,18 +1,18 @@ -/* -* Copyright (c) 2025 OceanBase. -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -*/ + /* + * Copyright (c) 2025 OceanBase. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ #define USING_LOG_PREFIX SQL_ENG diff --git a/src/sql/engine/table/ob_external_table_access_service.cpp b/src/sql/engine/table/ob_external_table_access_service.cpp index 485057111..2e5fafae4 100644 --- a/src/sql/engine/table/ob_external_table_access_service.cpp +++ b/src/sql/engine/table/ob_external_table_access_service.cpp @@ -1,18 +1,18 @@ -/* -* Copyright (c) 2025 OceanBase. -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -*/ + /* + * Copyright (c) 2025 OceanBase. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ #define USING_LOG_PREFIX SQL From e8f9c3f638208a3bf6df0cf315efaeb260daf5ba Mon Sep 17 00:00:00 2001 From: hnwyllmm Date: Wed, 20 May 2026 11:30:07 +0800 Subject: [PATCH 4/6] fixed tab --- src/sql/engine/basic/ob_select_into_op.cpp | 288 +++++++++--------- .../ob_external_table_access_service.cpp | 68 ++--- 2 files changed, 178 insertions(+), 178 deletions(-) diff --git a/src/sql/engine/basic/ob_select_into_op.cpp b/src/sql/engine/basic/ob_select_into_op.cpp index 5c03824ea..6d2ecbae3 100644 --- a/src/sql/engine/basic/ob_select_into_op.cpp +++ b/src/sql/engine/basic/ob_select_into_op.cpp @@ -1,18 +1,18 @@ - /* - * Copyright (c) 2025 OceanBase. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ +/* + * Copyright (c) 2025 OceanBase. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ #define USING_LOG_PREFIX SQL_ENG @@ -209,9 +209,9 @@ int ObSelectIntoOp::init_env_common() ret = OB_ERR_UNEXPECTED; LOG_WARN("get phy_plan_ctx failed", K(ret)); } else if (OB_FAIL(ObSQLUtils::get_param_value(MY_SPEC.outfile_name_, - phy_plan_ctx->get_param_store(), - file_name_, - need_check))) { + phy_plan_ctx->get_param_store(), + file_name_, + need_check))) { LOG_WARN("get param value failed", K(ret)); } else if (OB_FAIL(calc_url_and_set_access_info())) { LOG_WARN("failed to calc basic url and set device handle", K(ret)); @@ -221,7 +221,7 @@ int ObSelectIntoOp::init_env_common() ret = OB_NOT_SUPPORTED; LOG_USER_ERROR(OB_NOT_SUPPORTED, "select array/map into variables"); } else if (do_partition_ - && OB_FAIL(partition_map_.create(128, ObLabel("SelectInto"), ObLabel("SelectInto"), MTL_ID()))) { + && OB_FAIL(partition_map_.create(128, ObLabel("SelectInto"), ObLabel("SelectInto"), MTL_ID()))) { LOG_WARN("failed to create hashmap", K(ret)); } else if (MY_SPEC.select_exprs_.count() != MY_SPEC.alias_names_.strs_.count()) { ret = OB_ERR_UNEXPECTED; @@ -325,7 +325,7 @@ int ObSelectIntoOp::inner_get_next_row() if (is_odps_cpp_table_ == is_odps_java_table_) { ret = OB_ERR_UNEXPECTED; LOG_WARN("invalid table mode for odps table", K(ret), - K(is_odps_cpp_table_), K(is_odps_java_table_)); + K(is_odps_cpp_table_), K(is_odps_java_table_)); } else if (is_odps_cpp_table_) { ret = OB_NOT_SUPPORTED; LOG_USER_ERROR(OB_NOT_SUPPORTED, "external odps cpp table"); @@ -500,7 +500,7 @@ int ObSelectIntoOp::inner_close() } } else if (do_partition_) { for (ObPartitionWriterMap::iterator iter = partition_map_.begin(); - OB_SUCC(ret) && iter != partition_map_.end(); iter++) { + OB_SUCC(ret) && iter != partition_map_.end(); iter++) { if (OB_ISNULL(data_writer = iter->second)) { ret = OB_ERR_UNEXPECTED; LOG_WARN("data writer is unexpected null", K(ret)); @@ -525,10 +525,10 @@ int ObSelectIntoOp::get_row_str(const int64_t buf_len, //before 4_1 use output //after 4_1 use select exprs const 
ObIArray &select_exprs = (MY_SPEC.select_exprs_.empty()) ? - MY_SPEC.output_ : MY_SPEC.select_exprs_; + MY_SPEC.output_ : MY_SPEC.select_exprs_; if (!is_first_row && line_str_.is_varying_len_char_type()) { // lines terminated by "a" ret = databuff_printf(buf, buf_len, pos, "%.*s", line_str_.get_varchar().length(), - line_str_.get_varchar().ptr()); + line_str_.get_varchar().ptr()); } for (int i = 0 ; OB_SUCC(ret) && i < select_exprs.count() ; i++) { @@ -572,8 +572,8 @@ int ObSelectIntoOp::calc_first_file_path(ObString &path) ObString file_extension; ObSelectIntoOpInput *input = static_cast(input_); ObString input_file_name = file_location_ != IntoFileLocation::SERVER_DISK - ? path.split_on('?').trim() - : path; + ? path.split_on('?').trim() + : path; if (OB_ISNULL(input)) { ret = OB_ERR_UNEXPECTED; LOG_WARN("op input is null", K(ret)); @@ -681,13 +681,13 @@ int ObSelectIntoOp::calc_file_path_with_partition(ObString partition, ObExternal } else if (partition.length() == 0 && OB_FAIL(url_with_partition.append("__NULL__/"))) { LOG_WARN("failed to append string", K(ret)); } else if (OB_FAIL(url_with_partition.append_fmt("%.*s", - data_writer.url_.length(), - data_writer.url_.ptr()))) { + data_writer.url_.length(), + data_writer.url_.ptr()))) { LOG_WARN("failed to append string", K(ret)); } else if (OB_FAIL(ob_write_string(ctx_.get_allocator(), - url_with_partition.string(), - data_writer.url_, - true))) { + url_with_partition.string(), + data_writer.url_, + true))) { LOG_WARN("failed to write string", K(ret)); } } @@ -773,9 +773,9 @@ int ObSelectIntoOp::get_buf(char* &buf, int64_t &buf_len, int64_t &pos, ObCsvFil } int ObSelectIntoOp::use_shared_buf(ObCsvFileWriter &data_writer, - char* &buf, - int64_t &buf_len, - int64_t &pos) + char* &buf, + int64_t &buf_len, + int64_t &pos) { int ret = OB_SUCCESS; int64_t curr_pos = data_writer.get_curr_pos(); @@ -795,10 +795,10 @@ int ObSelectIntoOp::use_shared_buf(ObCsvFileWriter &data_writer, } int ObSelectIntoOp::resize_buf(char* 
&buf, - int64_t &buf_len, - int64_t &pos, - int64_t curr_pos, - bool is_json) + int64_t &buf_len, + int64_t &pos, + int64_t curr_pos, + bool is_json) { int ret = OB_SUCCESS; int64_t new_buf_len = buf_len * 2; @@ -825,9 +825,9 @@ int ObSelectIntoOp::resize_buf(char* &buf, } int ObSelectIntoOp::resize_or_flush_shared_buf(ObCsvFileWriter &data_writer, - char* &buf, - int64_t &buf_len, - int64_t &pos) + char* &buf, + int64_t &buf_len, + int64_t &pos) { int ret = OB_SUCCESS; if (!use_shared_buf_) { @@ -846,10 +846,10 @@ int ObSelectIntoOp::resize_or_flush_shared_buf(ObCsvFileWriter &data_writer, } int ObSelectIntoOp::check_buf_sufficient(ObCsvFileWriter &data_writer, - char* &buf, - int64_t &buf_len, - int64_t &pos, - int64_t str_len) + char* &buf, + int64_t &buf_len, + int64_t &pos, + int64_t str_len) { int ret = OB_SUCCESS; if (buf_len < str_len * 1.1) { @@ -890,11 +890,11 @@ int ObSelectIntoOp::print_str_or_json_with_escape(const ObObj &obj, ObCsvFileWri ObCharsetType src_type = ObCharset::charset_type_by_coll(obj.get_collation_type()); ObCharsetType dst_type = ObCharset::charset_type_by_coll(cs_type_); escape_printer_.do_encode_ = !(src_type == CHARSET_BINARY || src_type == dst_type - || src_type == CHARSET_INVALID); + || src_type == CHARSET_INVALID); escape_printer_.need_enclose_ = has_enclose_ && !obj.is_null(); escape_printer_.do_escape_ = true; escape_printer_.print_hex_ = obj.get_collation_type() == CS_TYPE_BINARY - && print_params_.binary_string_print_hex_; + && print_params_.binary_string_print_hex_; ObString str_to_escape; ObEvalCtx::TempAllocGuard tmp_alloc_g(eval_ctx_); common::ObArenaAllocator &temp_allocator = tmp_alloc_g.get_allocator(); @@ -924,10 +924,10 @@ int ObSelectIntoOp::print_str_or_json_with_escape(const ObObj &obj, ObCsvFileWri str_to_escape = obj.get_varchar(); } if (OB_SUCC(ret) && !use_shared_buf_ && OB_FAIL(check_buf_sufficient(data_writer, - escape_printer_.buf_, - escape_printer_.buf_len_, - escape_printer_.pos_, - 
str_to_escape.length()))) { + escape_printer_.buf_, + escape_printer_.buf_len_, + escape_printer_.pos_, + str_to_escape.length()))) { LOG_WARN("failed to check if buf is sufficient", K(ret)); } if (OB_SUCC(ret) && !use_shared_buf_) { @@ -942,10 +942,10 @@ int ObSelectIntoOp::print_str_or_json_with_escape(const ObObj &obj, ObCsvFileWri LOG_WARN("failed to flush buffer", K(ret)); } else if (OB_FALSE_IT(escape_printer_.pos_ = data_writer.get_curr_pos())) { } else if (OB_FAIL(ObFastStringScanner::foreach_char(str_to_escape, - src_type, - escape_printer_, - escape_printer_.do_encode_, - escape_printer_.ignore_convert_failed_))) { + src_type, + escape_printer_, + escape_printer_.do_encode_, + escape_printer_.ignore_convert_failed_))) { if (OB_SIZE_OVERFLOW != ret) { LOG_WARN("failed to print plain str", K(ret), K(src_type), K(escape_printer_.do_encode_)); } else if (OB_FAIL(use_shared_buf(data_writer, @@ -967,9 +967,9 @@ int ObSelectIntoOp::print_str_or_json_with_escape(const ObObj &obj, ObCsvFileWri LOG_WARN("failed to print plain str", K(ret), K(src_type), K(escape_printer_.do_encode_)); } } while (OB_SIZE_OVERFLOW == ret && OB_SUCC(resize_or_flush_shared_buf(data_writer, - escape_printer_.buf_, - escape_printer_.buf_len_, - escape_printer_.pos_))); + escape_printer_.buf_, + escape_printer_.buf_len_, + escape_printer_.pos_))); if (OB_FAIL(ret)) { LOG_WARN("failed to print plain str", K(ret)); } @@ -1010,7 +1010,7 @@ int ObSelectIntoOp::print_normal_obj_without_escape(const ObObj &obj, ObCsvFileW LOG_WARN("failed to print obj", K(ret)); } } while (OB_SIZE_OVERFLOW == ret - && OB_SUCC(resize_or_flush_shared_buf(data_writer, buf, buf_len, pos))); + && OB_SUCC(resize_or_flush_shared_buf(data_writer, buf, buf_len, pos))); if (OB_FAIL(ret)) { LOG_WARN("failed to print obj", K(ret)); } @@ -1022,10 +1022,10 @@ int ObSelectIntoOp::print_normal_obj_without_escape(const ObObj &obj, ObCsvFileW } int ObSelectIntoOp::print_json_to_json_buf(const ObObj &obj, - char* &buf, - int64_t 
&buf_len, - int64_t &pos, - ObCsvFileWriter &data_writer) + char* &buf, + int64_t &buf_len, + int64_t &pos, + ObCsvFileWriter &data_writer) { int ret = OB_SUCCESS; buf = get_json_buf(); @@ -1036,7 +1036,7 @@ int ObSelectIntoOp::print_json_to_json_buf(const ObObj &obj, LOG_WARN("failed to print obj", K(ret)); } } while (OB_SIZE_OVERFLOW == ret - && OB_SUCC(resize_buf(buf, buf_len, pos, data_writer.get_curr_pos(), true))); + && OB_SUCC(resize_buf(buf, buf_len, pos, data_writer.get_curr_pos(), true))); if (OB_FAIL(ret)) { LOG_WARN("failed to print json to json buffer", K(ret)); } @@ -1053,10 +1053,10 @@ int ObSelectIntoOp::write_lob_to_file(const ObObj &obj, ObCharsetType dst_type = ObCharset::charset_type_by_coll(cs_type_); escape_printer_.need_enclose_ = has_enclose_; escape_printer_.do_encode_ = !(src_type == CHARSET_BINARY || src_type == dst_type - || src_type == CHARSET_INVALID); + || src_type == CHARSET_INVALID); escape_printer_.do_escape_ = has_escape_; escape_printer_.print_hex_ = obj.get_collation_type() == CS_TYPE_BINARY - && print_params_.binary_string_print_hex_; + && print_params_.binary_string_print_hex_; ObDatumMeta input_meta = expr.datum_meta_; ObTextStringIterState state; ObString src_block_data; @@ -1071,14 +1071,14 @@ int ObSelectIntoOp::write_lob_to_file(const ObObj &obj, // When truncated_len == src_block_data.length() when truncated length equals source block data length // Indicates that the current foreach_char is processing only invalid data at the end of the lob, i.e., truncated data from the previous round, to avoid infinite loops while (OB_SUCC(ret) - && (state = lob_iter.get_next_block(src_block_data)) == TEXTSTRING_ITER_NEXT) { + && (state = lob_iter.get_next_block(src_block_data)) == TEXTSTRING_ITER_NEXT) { // outrow lob will only be false on the last iteration, inrow lob iterates only once, and is false stop_when_truncated = (truncated_len != src_block_data.length()) && lob_iter.is_outrow_lob(); if (!use_shared_buf_ && 
OB_FAIL(check_buf_sufficient(data_writer, - escape_printer_.buf_, - escape_printer_.buf_len_, - escape_printer_.pos_, - src_block_data.length()))) { + escape_printer_.buf_, + escape_printer_.buf_len_, + escape_printer_.pos_, + src_block_data.length()))) { LOG_WARN("failed to check if buf is sufficient", K(ret)); } if (OB_SUCC(ret) && !use_shared_buf_) { @@ -1098,12 +1098,12 @@ int ObSelectIntoOp::write_lob_to_file(const ObObj &obj, LOG_WARN("failed to flush buffer", K(ret)); } else if (OB_FALSE_IT(escape_printer_.pos_ = data_writer.get_curr_pos())) { } else if (OB_FAIL(ObFastStringScanner::foreach_char(src_block_data, - src_type, - escape_printer_, - escape_printer_.do_encode_, - escape_printer_.ignore_convert_failed_, - stop_when_truncated, - &truncated_len))) { + src_type, + escape_printer_, + escape_printer_.do_encode_, + escape_printer_.ignore_convert_failed_, + stop_when_truncated, + &truncated_len))) { if (OB_ERR_DATA_TRUNCATED == ret && stop_when_truncated) { lob_iter.set_reserved_byte_len(truncated_len); ret = OB_SUCCESS; @@ -1135,12 +1135,12 @@ int ObSelectIntoOp::write_lob_to_file(const ObObj &obj, LOG_WARN("failed to flush shared buffer", K(ret)); } else if (OB_FALSE_IT(escape_printer_.pos_ = 0)) { } else if (OB_FAIL(ObFastStringScanner::foreach_char(src_block_data, - src_type, - escape_printer_, - escape_printer_.do_encode_, - escape_printer_.ignore_convert_failed_, - stop_when_truncated, - &truncated_len))) { + src_type, + escape_printer_, + escape_printer_.do_encode_, + escape_printer_.ignore_convert_failed_, + stop_when_truncated, + &truncated_len))) { if (OB_ERR_DATA_TRUNCATED == ret && stop_when_truncated) { lob_iter.set_reserved_byte_len(truncated_len); ret = OB_SUCCESS; @@ -1276,8 +1276,8 @@ int ObSelectIntoOp::into_outfile(ObExternalFileWriter *data_writer) ret = OB_ERR_UNEXPECTED; LOG_WARN("datum is unexpected null", K(ret)); } else if (OB_FAIL(datum->to_obj(obj, - select_exprs.at(i)->obj_meta_, - select_exprs.at(i)->obj_datum_map_))) { + 
select_exprs.at(i)->obj_meta_, + select_exprs.at(i)->obj_datum_map_))) { LOG_WARN("failed to get obj from datum", K(ret)); } else if (!ob_is_text_tc(select_exprs.at(i)->obj_meta_.get_type()) || obj.is_null()) { OZ(print_field(obj, *csv_data_writer)); @@ -1325,7 +1325,7 @@ int ObSelectIntoOp::decimal_to_string(const ObDatum &datum, ret = OB_ALLOCATE_MEMORY_FAILED; LOG_WARN("failed to alloc memory", K(ret)); } else if (OB_FAIL(wide::to_string(datum.get_decimal_int(), datum.get_int_bytes(), datum_meta.scale_, - buf, OB_CAST_TO_VARCHAR_MAX_LENGTH, pos))) { + buf, OB_CAST_TO_VARCHAR_MAX_LENGTH, pos))) { LOG_WARN("failed to get string", K(ret)); } else { res.assign(buf, pos); @@ -1371,7 +1371,7 @@ int ObSelectIntoOp::into_outfile_batch_csv(const ObBatchRows &brs, ObExternalFil ret = OB_ERR_UNEXPECTED; LOG_WARN("get unexpected null data writer", K(ret)); } else if (has_compress_ && OB_ISNULL(csv_data_writer->get_compress_stream_writer()) - && OB_FAIL(csv_data_writer->init_compress_writer(ctx_.get_allocator(), + && OB_FAIL(csv_data_writer->init_compress_writer(ctx_.get_allocator(), external_properties_.csv_format_.compression_algorithm_, MY_SPEC.buffer_size_))) { LOG_WARN("failed to init compress stream writer", K(ret)); @@ -1381,8 +1381,8 @@ int ObSelectIntoOp::into_outfile_batch_csv(const ObBatchRows &brs, ObExternalFil ret = OB_ERR_UNEXPECTED; LOG_WARN("datum is unexpected null", K(ret)); } else if (OB_FAIL(datum->to_obj(obj, - select_exprs.at(col_idx)->obj_meta_, - select_exprs.at(col_idx)->obj_datum_map_))) { + select_exprs.at(col_idx)->obj_meta_, + select_exprs.at(col_idx)->obj_datum_map_))) { LOG_WARN("failed to get obj from datum", K(ret)); } else if (!ob_is_text_tc(select_exprs.at(col_idx)->obj_meta_.get_type()) || obj.is_null()) { OZ(print_field(obj, *csv_data_writer)); @@ -1446,7 +1446,7 @@ int ObSelectIntoOp::get_data_from_expr_vector(const common::ObIVector* expr_vect break; case ObMySQLDateTimeType: 
CAST_FAIL(ObTimeConverter::mdatetime_to_datetime(expr_vector->get_mysql_datetime(row_idx), value, - date_sql_mode)); + date_sql_mode)); break; default: ret = OB_OBJ_TYPE_ERROR; @@ -1488,7 +1488,7 @@ int ObSelectIntoOp::calc_byte_array(const common::ObIVector* expr_vector, has_lob_header, ob_str, row_idx))) { LOG_WARN("failed to get string", K(ret)); } else if (ob_str.length() == 0 || CS_TYPE_BINARY == datum_meta.cs_type_ - || CHARSET_UTF8MB4 == ObCharset::charset_type_by_coll(datum_meta.cs_type_)) { + || CHARSET_UTF8MB4 == ObCharset::charset_type_by_coll(datum_meta.cs_type_)) { if (OB_FAIL(ob_write_string(allocator, ob_str, res_str))) { LOG_WARN("failed to write string", K(ret)); } else { @@ -1521,9 +1521,9 @@ int ObSelectIntoOp::init_parquet_env() } int ObSelectIntoOp::get_parquet_logical_type(std::shared_ptr &logical_type, - const ObObjType &obj_type, - const int32_t precision, - const int32_t scale) + const ObObjType &obj_type, + const int32_t precision, + const int32_t scale) { int ret = OB_SUCCESS; if (ObTinyIntType == obj_type) { @@ -1582,8 +1582,8 @@ int ObSelectIntoOp::get_parquet_physical_type(parquet::Type::type &physical_type || ob_is_date_or_mysql_date(obj_type) || ob_is_year_tc(obj_type)) { physical_type = parquet::Type::INT32; } else if (ObIntType == obj_type || ObUInt64Type == obj_type - || ob_is_datetime_or_mysql_datetime_tc(obj_type) - || ob_is_time_tc(obj_type) || ob_is_bit_tc(obj_type)) { + || ob_is_datetime_or_mysql_datetime_tc(obj_type) + || ob_is_time_tc(obj_type) || ob_is_bit_tc(obj_type)) { physical_type = parquet::Type::INT64; } else if (ob_is_float_tc(obj_type)) { // float, ufloat physical_type = parquet::Type::FLOAT; @@ -1592,9 +1592,9 @@ int ObSelectIntoOp::get_parquet_physical_type(parquet::Type::type &physical_type } else if (ob_is_number_or_decimal_int_tc(obj_type)) { physical_type = parquet::Type::FIXED_LEN_BYTE_ARRAY; } else if (ob_is_string_tc(obj_type) /*varchar,char,varbinary,binary*/ - || ob_is_text_tc(obj_type) 
/*TinyText,MediumText,Text,LongText,TinyBLOB,MediumBLOB,BLOB,LongBLOB*/ - || ob_is_enum_or_set_type(obj_type) - || ObNullType == obj_type) { + || ob_is_text_tc(obj_type) /*TinyText,MediumText,Text,LongText,TinyBLOB,MediumBLOB,BLOB,LongBLOB*/ + || ob_is_enum_or_set_type(obj_type) + || ObNullType == obj_type) { physical_type = parquet::Type::BYTE_ARRAY; } else { ret = OB_NOT_SUPPORTED; @@ -1708,7 +1708,7 @@ int ObSelectIntoOp::into_outfile_batch_parquet(const ObBatchRows &brs, ObExterna ret = OB_ERR_UNEXPECTED; LOG_WARN("get unexpected null data writer", K(ret)); } else if (parquet_data_writer->is_file_writer_null() - && OB_FAIL(parquet_data_writer->open_parquet_file_writer(arrow_alloc_, + && OB_FAIL(parquet_data_writer->open_parquet_file_writer(arrow_alloc_, external_properties_.parquet_format_.row_group_size_, external_properties_.parquet_format_.compress_type_index_, brs.size_, @@ -1721,18 +1721,18 @@ int ObSelectIntoOp::into_outfile_batch_parquet(const ObBatchRows &brs, ObExterna try { for (int64_t col_idx = 0; OB_SUCC(ret) && col_idx < select_exprs.count(); col_idx++) { if (OB_FAIL(build_parquet_cell(parquet_data_writer->get_row_group_writer(), - select_exprs.at(col_idx)->datum_meta_, - select_exprs.at(col_idx)->obj_meta_, - expr_vectors.at(col_idx), - col_idx, - row_idx, - parquet_data_writer->get_row_batch_offset(), - parquet_data_writer->get_parquet_value_offsets().at(col_idx), - parquet_data_writer->get_parquet_row_def_levels().at(col_idx), - parquet_data_writer->get_batch_allocator(), - parquet_data_writer->get_parquet_row_batch().at(col_idx), - is_strict_mode, - date_sql_mode))) { + select_exprs.at(col_idx)->datum_meta_, + select_exprs.at(col_idx)->obj_meta_, + expr_vectors.at(col_idx), + col_idx, + row_idx, + parquet_data_writer->get_row_batch_offset(), + parquet_data_writer->get_parquet_value_offsets().at(col_idx), + parquet_data_writer->get_parquet_row_def_levels().at(col_idx), + parquet_data_writer->get_batch_allocator(), + 
parquet_data_writer->get_parquet_row_batch().at(col_idx), + is_strict_mode, + date_sql_mode))) { LOG_WARN("failed to build parquet cell", K(ret)); } } @@ -1807,18 +1807,18 @@ int ObSelectIntoOp::check_parquet_file_size(ObParquetFileWriter &data_writer) } int ObSelectIntoOp::build_parquet_cell(parquet::RowGroupWriter* rg_writer, - const ObDatumMeta &datum_meta, - const ObObjMeta &obj_meta, - const common::ObIVector* expr_vector, - int64_t col_idx, - int64_t row_idx, - int64_t row_offset, - int64_t &value_offset, - int16_t* definition_levels, - ObIAllocator &allocator, - void* value_batch, - const bool is_strict_mode, - const ObDateSqlMode date_sql_mode) + const ObDatumMeta &datum_meta, + const ObObjMeta &obj_meta, + const common::ObIVector* expr_vector, + int64_t col_idx, + int64_t row_idx, + int64_t row_offset, + int64_t &value_offset, + int16_t* definition_levels, + ObIAllocator &allocator, + void* value_batch, + const bool is_strict_mode, + const ObDateSqlMode date_sql_mode) { int ret = OB_SUCCESS; int16_t null_definition_level = 0; @@ -1846,12 +1846,12 @@ int ObSelectIntoOp::build_parquet_cell(parquet::RowGroupWriter* rg_writer, if (expr_vector->is_null(row_idx)) { definition_levels[row_offset] = null_definition_level; } else if (OB_FAIL(calc_byte_array(expr_vector, - row_idx, - datum_meta, - obj_meta, - allocator, - buf, - res_len))) { + row_idx, + datum_meta, + obj_meta, + allocator, + buf, + res_len))) { LOG_WARN("failed to calc parquet byte array", K(ret)); } else { value->ptr = reinterpret_cast(buf); @@ -1978,10 +1978,10 @@ int ObSelectIntoOp::build_parquet_cell(parquet::RowGroupWriter* rg_writer, } int ObSelectIntoOp::calc_parquet_decimal_array(const common::ObIVector* expr_vector, - int row_idx, - const ObDatumMeta &datum_meta, - int parquet_decimal_length, - uint8_t* parquet_flba_ptr) + int row_idx, + const ObDatumMeta &datum_meta, + int parquet_decimal_length, + uint8_t* parquet_flba_ptr) { int ret = OB_SUCCESS; const ObDecimalInt* ob_decimal; @@ 
-2082,7 +2082,7 @@ int ObSelectIntoOp::into_varlist() //before 4_1 use output //after 4_1 use select exprs const ObIArray &select_exprs = (MY_SPEC.select_exprs_.empty()) ? - MY_SPEC.output_ : MY_SPEC.select_exprs_; + MY_SPEC.output_ : MY_SPEC.select_exprs_; const ObIArray &user_vars = MY_SPEC.user_vars_; ObArenaAllocator lob_tmp_allocator("LobTmp", OB_MALLOC_NORMAL_BLOCK_SIZE, MTL_ID()); if (select_exprs.count() != user_vars.count()) { @@ -2126,11 +2126,11 @@ int ObSelectIntoOp::extract_fisrt_wchar_from_varhcar(const ObObj &obj, int32_t & } int ObSelectIntoOp::print_wchar_to_buf(char *buf, - const int64_t buf_len, - int64_t &pos, - int32_t wchar, - ObString &str, - ObCollationType coll_type) + const int64_t buf_len, + int64_t &pos, + int32_t wchar, + ObString &str, + ObCollationType coll_type) { int ret = OB_SUCCESS; int result_len = 0; @@ -2285,8 +2285,8 @@ int ObSelectIntoOp::get_data_writer_for_partition(const ObString &partition_str, //add to hashmap if (OB_FAIL(ret)) { } else if (OB_FAIL(ob_write_string(ctx_.get_allocator(), - partition_str, - partition))) { + partition_str, + partition))) { LOG_WARN("failed to write string", K(ret)); } else if (OB_FAIL(partition_map_.set_refactored(partition, data_writer))) { LOG_WARN("failed to add data writer to map", K(ret)); @@ -2320,7 +2320,7 @@ int ObSelectIntoOp::create_the_only_data_writer(ObExternalFileWriter *&data_writ } if (OB_FAIL(ret)) { } else if (T_INTO_OUTFILE == MY_SPEC.into_type_ && MY_SPEC.is_single_ - && OB_FAIL(data_writer->open_file())) { + && OB_FAIL(data_writer->open_file())) { LOG_WARN("failed to open file", K(ret)); } else if (ObExternalFileFormat::FormatType::CSV_FORMAT == format_type_ && MY_SPEC.buffer_size_ > 0) { csv_data_writer = static_cast(data_writer); @@ -2344,7 +2344,7 @@ int ObSelectIntoOp::new_data_writer(ObExternalFileWriter *&data_writer) LOG_WARN("failed to allocate data writer", K(ret), K(sizeof(ObCsvFileWriter))); } else { data_writer = new(ptr) ObCsvFileWriter(access_info_, 
file_location_, use_shared_buf_, - has_compress_, has_lob_, write_offset_); + has_compress_, has_lob_, write_offset_); } break; } @@ -2385,7 +2385,7 @@ void ObSelectIntoOp::destroy() ObExternalFileWriter *data_writer = NULL; if (do_partition_) { for (ObPartitionWriterMap::iterator iter = partition_map_.begin(); - iter != partition_map_.end(); iter++) { + iter != partition_map_.end(); iter++) { if (OB_ISNULL(data_writer = iter->second)) { } else { data_writer->~ObExternalFileWriter(); diff --git a/src/sql/engine/table/ob_external_table_access_service.cpp b/src/sql/engine/table/ob_external_table_access_service.cpp index 2e5fafae4..e4a9a8d6a 100644 --- a/src/sql/engine/table/ob_external_table_access_service.cpp +++ b/src/sql/engine/table/ob_external_table_access_service.cpp @@ -1,18 +1,18 @@ - /* - * Copyright (c) 2025 OceanBase. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ +/* + * Copyright (c) 2025 OceanBase. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ #define USING_LOG_PREFIX SQL @@ -173,11 +173,11 @@ class ObLocalFileListArrayOpWithFilter : public ObBaseDirEntryOperator { public: ObLocalFileListArrayOpWithFilter(ObIArray &name_array, - ObIArray & file_size, - const ObString &path, - const ObString &origin_path, - ObExternalPathFilter *filter, - ObIAllocator &array_allocator) + ObIArray & file_size, + const ObString &path, + const ObString &origin_path, + ObExternalPathFilter *filter, + ObIAllocator &array_allocator) : name_array_(name_array), file_size_(file_size), path_(path), origin_path_(origin_path), filter_(filter), allocator_(array_allocator) {} virtual bool need_get_file_size() const override { return true; } @@ -273,7 +273,7 @@ int ObExternalDataAccessDriver::get_file_list(const ObString &path, OZ(FileDirectoryUtils::is_directory(path_without_prifix.ptr(), is_dir)); if (!is_dir) { LOG_INFO("external location is not a directory", - K(path_without_prifix)); + K(path_without_prifix)); } else { OZ(file_dirs.push_back(path_cstring)); } @@ -284,7 +284,7 @@ int ObExternalDataAccessDriver::get_file_list(const ObString &path, ObString file_dir = file_dirs.at(i); ObLocalFileListArrayOpWithFilter dir_op(file_dirs, useless_size, file_dir, path_cstring, NULL, allocator); ObLocalFileListArrayOpWithFilter file_op(file_urls, file_sizes, file_dir, path_cstring, - pattern.empty() ? NULL : &filter, allocator); + pattern.empty() ? 
NULL : &filter, allocator); dir_op.set_dir_flag(); if (OB_FAIL(ObBackupIoAdapter::list_files(file_dir, access_info_, file_op))) { LOG_WARN("fail to list files", KR(ret), K(file_dir), K_(access_info)); @@ -347,9 +347,9 @@ const char * ObExternalStreamFileReader::MEMORY_LABEL = "ExternalReader"; const int64_t ObExternalStreamFileReader::COMPRESSED_DATA_BUFFER_SIZE = 2 * 1024 * 1024; int ObExternalStreamFileReader::init(const common::ObString &location, - const ObString &access_info, - ObCSVGeneralFormat::ObCSVCompression compression_format, - ObIAllocator &allocator) + const ObString &access_info, + ObCSVGeneralFormat::ObCSVCompression compression_format, + ObIAllocator &allocator) { int ret = OB_SUCCESS; if (OB_NOT_NULL(allocator_)) { @@ -543,7 +543,7 @@ int ObExternalStreamFileReader::create_decompressor(ObCSVGeneralFormat::ObCSVCom if (OB_FAIL(ObDecompressor::create(compression_format, *allocator_, decompressor_))) { LOG_WARN("failed to create decompressor", K(ret)); } else if (OB_ISNULL(compressed_data_) && - OB_ISNULL(compressed_data_ = (char *)allocator_->alloc(COMPRESSED_DATA_BUFFER_SIZE))) { + OB_ISNULL(compressed_data_ = (char *)allocator_->alloc(COMPRESSED_DATA_BUFFER_SIZE))) { ret = OB_ALLOCATE_MEMORY_FAILED; LOG_WARN("failed to allocate memory", K(COMPRESSED_DATA_BUFFER_SIZE)); } @@ -669,8 +669,8 @@ int ObExternalTableAccessService::revert_scan_iter(ObNewRowIterator *iter) int ObExternalTableRowIterator::init(const ObTableScanParam *scan_param) { - scan_param_ = scan_param; - return init_exprs(scan_param); + scan_param_ = scan_param; + return init_exprs(scan_param); } int ObExternalTableRowIterator::gen_ip_port(ObIAllocator &allocator) @@ -711,7 +711,7 @@ int ObExternalTableRowIterator::init_exprs(const storage::ObTableScanParam *scan if (OB_SUCC(ret) && column_exprs_.count() != scan_param->ext_column_convert_exprs_->count()) { ret = OB_ERR_UNEXPECTED; LOG_WARN("column expr not equal to convert convert expr", K(ret), - K(column_exprs_), 
KPC(scan_param->ext_column_convert_exprs_)); + K(column_exprs_), KPC(scan_param->ext_column_convert_exprs_)); } } return ret; @@ -856,11 +856,11 @@ DEF_TO_STRING(ObExternalIteratorState) int64_t pos = 0; J_OBJ_START(); J_KV(K_(file_idx), - K_(part_id), - K_(cur_file_id), - K_(cur_line_number), - K_(cur_file_url), - K_(part_list_val)); + K_(part_id), + K_(cur_file_id), + K_(cur_line_number), + K_(cur_file_url), + K_(part_list_val)); J_OBJ_END(); return pos; } From db1d086e177fb606bdfac30b4bcb231a2cc5188c Mon Sep 17 00:00:00 2001 From: hnwyllmm Date: Fri, 22 May 2026 11:35:15 +0800 Subject: [PATCH 5/6] Update 2 files - /src/sql/engine/basic/ob_select_into_op.cpp - /src/sql/engine/table/ob_external_table_access_service.cpp --- src/sql/engine/basic/ob_select_into_op.cpp | 5 ----- src/sql/engine/table/ob_external_table_access_service.cpp | 5 ----- 2 files changed, 10 deletions(-) diff --git a/src/sql/engine/basic/ob_select_into_op.cpp b/src/sql/engine/basic/ob_select_into_op.cpp index 6d2ecbae3..a977f1392 100644 --- a/src/sql/engine/basic/ob_select_into_op.cpp +++ b/src/sql/engine/basic/ob_select_into_op.cpp @@ -2350,10 +2350,6 @@ int ObSelectIntoOp::new_data_writer(ObExternalFileWriter *&data_writer) } case ObExternalFileFormat::FormatType::PARQUET_FORMAT: { -#ifdef OB_BUILD_EMBED_MODE - ret = OB_NOT_SUPPORTED; - LOG_WARN("parquet not supported in embed mode", K(ret)); -#else if (lib::is_embed_mode()) { ret = OB_NOT_SUPPORTED; LOG_WARN("parquet not supported in embed mode", K(ret)); @@ -2363,7 +2359,6 @@ int ObSelectIntoOp::new_data_writer(ObExternalFileWriter *&data_writer) } else { data_writer = new(ptr) ObParquetFileWriter(access_info_, file_location_, parquet_writer_schema_); } -#endif break; } case ObExternalFileFormat::FormatType::ORC_FORMAT: diff --git a/src/sql/engine/table/ob_external_table_access_service.cpp b/src/sql/engine/table/ob_external_table_access_service.cpp index e4a9a8d6a..2862f8ac4 100644 --- 
a/src/sql/engine/table/ob_external_table_access_service.cpp +++ b/src/sql/engine/table/ob_external_table_access_service.cpp @@ -573,15 +573,10 @@ int ObExternalTableAccessService::table_scan( } break; case ObExternalFileFormat::PARQUET_FORMAT: -#ifdef OB_BUILD_EMBED_MODE - ret = OB_NOT_SUPPORTED; - LOG_WARN("parquet not supported in embed mode", K(ret)); -#else if (OB_ISNULL(row_iter = OB_NEWx(ObParquetTableRowIterator, (scan_param.allocator_)))) { ret = OB_ALLOCATE_MEMORY_FAILED; LOG_WARN("alloc memory failed", K(ret)); } -#endif break; case ObExternalFileFormat::ODPS_FORMAT: if (!GCONF._use_odps_jni_connector) { From e2512cf39ed7366b55f593aa45cb736eb71dd101 Mon Sep 17 00:00:00 2001 From: "dingyixuan.dyx" Date: Fri, 22 May 2026 21:32:46 +0800 Subject: [PATCH 6/6] chore: align branch state with MR diff --- src/sql/engine/basic/ob_select_into_op.cpp | 12 +++++++----- .../table/ob_external_table_access_service.cpp | 5 +++++ 2 files changed, 12 insertions(+), 5 deletions(-) diff --git a/src/sql/engine/basic/ob_select_into_op.cpp b/src/sql/engine/basic/ob_select_into_op.cpp index a977f1392..d11b9877c 100644 --- a/src/sql/engine/basic/ob_select_into_op.cpp +++ b/src/sql/engine/basic/ob_select_into_op.cpp @@ -1181,7 +1181,7 @@ int ObSelectIntoOp::write_single_char_to_file(const char *wchar, ObCsvFileWriter data_writer.set_curr_pos(pos + 1); } else if (OB_FAIL(use_shared_buf(data_writer, buf, buf_len, pos))) { LOG_WARN("failed to use shared buffer", K(ret)); - } + } } if (OB_SUCC(ret) && use_shared_buf_) { if (pos < buf_len) { @@ -2350,15 +2350,17 @@ int ObSelectIntoOp::new_data_writer(ObExternalFileWriter *&data_writer) } case ObExternalFileFormat::FormatType::PARQUET_FORMAT: { - if (lib::is_embed_mode()) { - ret = OB_NOT_SUPPORTED; - LOG_WARN("parquet not supported in embed mode", K(ret)); - } else if (OB_ISNULL(ptr = ctx_.get_allocator().alloc(sizeof(ObParquetFileWriter)))) { +#ifndef OB_BUILD_EMBED_MODE + if (OB_ISNULL(ptr = 
ctx_.get_allocator().alloc(sizeof(ObParquetFileWriter)))) { ret = OB_ALLOCATE_MEMORY_FAILED; LOG_WARN("failed to allocate data writer", K(ret), K(sizeof(ObParquetFileWriter))); } else { data_writer = new(ptr) ObParquetFileWriter(access_info_, file_location_, parquet_writer_schema_); } +#else + ret = OB_NOT_SUPPORTED; + LOG_WARN("parquet is not supported in embed mode", K(ret)); +#endif // OB_BUILD_EMBED_MODE break; } case ObExternalFileFormat::FormatType::ORC_FORMAT: diff --git a/src/sql/engine/table/ob_external_table_access_service.cpp b/src/sql/engine/table/ob_external_table_access_service.cpp index 2862f8ac4..67c0749d3 100644 --- a/src/sql/engine/table/ob_external_table_access_service.cpp +++ b/src/sql/engine/table/ob_external_table_access_service.cpp @@ -573,10 +573,15 @@ int ObExternalTableAccessService::table_scan( } break; case ObExternalFileFormat::PARQUET_FORMAT: +#ifndef OB_BUILD_EMBED_MODE if (OB_ISNULL(row_iter = OB_NEWx(ObParquetTableRowIterator, (scan_param.allocator_)))) { ret = OB_ALLOCATE_MEMORY_FAILED; LOG_WARN("alloc memory failed", K(ret)); } +#else + ret = OB_NOT_SUPPORTED; + LOG_WARN("parquet is not supported in embed mode", K(ret)); +#endif // OB_BUILD_EMBED_MODE break; case ObExternalFileFormat::ODPS_FORMAT: if (!GCONF._use_odps_jni_connector) {