diff --git a/.gitignore b/.gitignore index 59f3cd1142..5b5dddbd8b 100755 --- a/.gitignore +++ b/.gitignore @@ -85,6 +85,7 @@ Backends/installed/** ScannerBit/downloaded/** ScannerBit/installed/** ScannerBit/lib/plugin_libraries.list +downloaded_tarball_paths.txt # GUM files *.mug diff --git a/Printers/include/gambit/Printers/printers/hdf5printer.hpp b/Printers/include/gambit/Printers/printers/hdf5printer.hpp index f6acf1b267..613b2761ea 100644 --- a/Printers/include/gambit/Printers/printers/hdf5printer.hpp +++ b/Printers/include/gambit/Printers/printers/hdf5printer.hpp @@ -332,17 +332,20 @@ namespace Gambit std::string group; // HDF5 group location to store datasets std::string metadata_group; // HDF5 group location to store metadata - // Handles for HDF5 files and groups containing the datasets - hid_t file_id; - hid_t group_id; - hid_t RA_group_id; - hid_t metadata_id; + // Handles for HDF5 files and groups containing the datasets. + // Only the primary printer's constructor assigns the file/group + // handles (file_id, group_id, RA_group_id, metadata_id). Aux instances + // inherit the *_location_id values from the primary. + hid_t file_id = -1; + hid_t group_id = -1; + hid_t RA_group_id = -1; + hid_t metadata_id = -1; // Handle to a location in a HDF5 to which the datasets will be written // i.e. a file or a group. - hid_t location_id; - hid_t RA_location_id; - hid_t metadata_location_id; + hid_t location_id = -1; + hid_t RA_location_id = -1; + hid_t metadata_location_id = -1; /// Pointer to the primary printer object // (if this is an auxilliary printer, else it is "this" //NULL) @@ -543,10 +546,13 @@ namespace Gambit // Force increment the buffer to "catch it up" to the current sync // position, in case it has been created "late". - // We subtract one because another increment will happen after - // the print statement (that triggered the creation of the new - // buffer) completes. - if(synchronised) it->second.fast_forward(printer->get_sync_pos()-1); + if (printer->get_sync_pos() > 0) // if 0, nothing to catch up to + { + // We subtract one because another increment will happen after + // the print statement (that triggered the creation of the new + // buffer) completes. + if(synchronised) it->second.fast_forward(printer->get_sync_pos()-1); + } } if( it == local_buffers.end() ) diff --git a/Printers/include/gambit/Printers/printers/hdf5printer/DataSetInterfaceBase.hpp b/Printers/include/gambit/Printers/printers/hdf5printer/DataSetInterfaceBase.hpp index 17d4b01521..8e55d47338 100644 --- a/Printers/include/gambit/Printers/printers/hdf5printer/DataSetInterfaceBase.hpp +++ b/Printers/include/gambit/Printers/printers/hdf5printer/DataSetInterfaceBase.hpp @@ -184,23 +184,20 @@ namespace Gambit { for(std::size_t i=0; i DataSetInterfaceBase::~DataSetInterfaceBase() { - // TODO: Having problems with copied objects sharing dataset identifiers, and closing datasets prematurely on each other. - // To fix, will probably need to have a fancy copy constructor or something. Or wrap datasets in an - // object which itself has a fancy copy constructor. For now, just leave dataset resources lying around, - // probably won't cause any issues. - // Or could explicity tell interface to close datasets before the objects are destroyed. - //if(this->dset_id>=0) - //{ - // herr_t status = H5Dclose(this->dset_id); - // if(status<0) - // { - // logger() << LogTags::printers << LogTags::err < inside H5P_LocalBufferManager) and + /// are populated via the assign-from-temporary pattern at + /// H5P_LocalBufferManager::get_buffer, so each buffer object's + /// destructor runs at least once on a temporary that shares state + /// (HDF5 dataset handles, sync counters) with the surviving + /// container slot. Triggering write_to_disk() / RA_write_to_disk() + /// here would therefore either double-write or write through stale + /// state, depending on which copy is being destroyed. End-of-scan + /// flushing is handled deterministically by HDF5Printer::flush() + /// and HDF5Printer::finalise() instead, which iterate over the + /// surviving buffers in the printer's all_buffers map. template - VertexBufferNumeric1D_HDF5::~VertexBufferNumeric1D_HDF5() + VertexBufferNumeric1D_HDF5::~VertexBufferNumeric1D_HDF5() { - //TODO: Do this in some more controlled way - //if(this->is_synchronised()) { write_to_disk(); } - //else - //{ - // RA_write_to_disk(); - //} } // Print out report on buffer sync status diff --git a/Printers/include/gambit/Printers/printers/hdf5printer_v2.hpp b/Printers/include/gambit/Printers/printers/hdf5printer_v2.hpp index 33a177b931..14d2be3d61 100644 --- a/Printers/include/gambit/Printers/printers/hdf5printer_v2.hpp +++ b/Printers/include/gambit/Printers/printers/hdf5printer_v2.hpp @@ -68,10 +68,6 @@ namespace Gambit typedef long long longlong; typedef unsigned long long ulonglong; - // DEBUG h5v2_BLOCK message counters - //static int recv_counter; - //static int send_counter; - /// Length of chunks in chunked HDF5 dataset. Affects write/retrieval performance for blocks of data of various sizes. /// It is set to an "intermediate" sort of size since that seems to work well enough. static const std::size_t HDF5_CHUNKLENGTH = 100; @@ -82,25 +78,9 @@ namespace Gambit /// Largest allowed size of buffers. Size can be dynamically set from 1 to this number. static const std::size_t MAX_BUFFER_SIZE = 100000; - /// MPI tags for HDF5 printer v2 - const int h5v2_bufname(10); - const int h5v2_bufdata_points(11); - const int h5v2_bufdata_ranks(12); - const int h5v2_bufdata_valid(13); - const int h5v2_bufdata_type(14); - const int h5v2_bufdata_values(15); - // Message block end tag: - // 1 means "there is another block of messages to receive" - // 0 means "there are no more blocks of messages to receive" - const int h5v2_BLOCK(30); - // "Begin sending data" tag - const int h5v2_BEGIN(31); - - // The 'h5v2_bufdata_type' messages send an integer encoding - // the datatype for the h5v2_bufdata_values messages + // The integer-encoded type ID for buffer-data messages. // Need a unique integer for each type. We can encode these // with a template function: - template std::set set_diff(const std::set& set1, const std::set& set2) { @@ -308,6 +288,9 @@ namespace Gambit errmsg << "Error! Tried to write to dataset (name="<,std::vector> flush_to_vector_int(const std::vector& order) = 0; -#ifdef WITH_MPI - /// Send buffer contents to another process - virtual void MPI_flush_to_rank(const unsigned int r) = 0; - -#endif - /// Make sure datasets exist on disk with the correct name and size virtual void ensure_dataset_exists(const hid_t loc_id, const std::size_t length) = 0; @@ -905,102 +882,6 @@ namespace Gambit return buffer.size(); } -#ifdef WITH_MPI - // Send buffer contents to a different process - void MPI_flush_to_rank(const unsigned int r) - { - if(buffer.size()>0) - { - // Get name of the dataset this buffer is associated with - std::string namebuf = dset_name(); - // Copy point data and values into MPI send buffer - std::vector pointIDs; - std::vector ranks; // Will assume all PPIDpairs are valid. I think this is fine to do... - std::vector valid; // We have to send the invalid points too, to maintain buffer synchronicity - std::vector values; - int type(h5v2_type()); // Get integer identifying the type of the data values - int more_buffers = 1; // Flag indicating that there is a block of data to receive - for(auto it=buffer.begin(); it!=buffer.end(); ++it) - { - pointIDs.push_back(it->first.pointID); - ranks .push_back(it->first.rank); - valid .push_back(buffer_valid.at(it->first)); - values .push_back(it->second); - } - - // Debug info - #ifdef HDF5PRINTER2_DEBUG - logger()< pointIDs(Npoints); - std::vector ranks(Npoints); - std::vector valid(Npoints); - std::vector values(Npoints); - - // Receive buffer data - myComm.Recv(&values[0] , Npoints, r, h5v2_bufdata_values); - myComm.Recv(&pointIDs[0], Npoints, r, h5v2_bufdata_points); - myComm.Recv(&ranks[0] , Npoints, r, h5v2_bufdata_ranks); - myComm.Recv(&valid[0] , Npoints, r, h5v2_bufdata_valid); - - // Pack it into this buffer - #ifdef HDF5PRINTER2_DEBUG - logger()<& buffer = get_buffer(dset_name, buffered_points); - //std::cout<<"(rank "< void MPI_add_int_block_to_buffer(const HDF5bufferchunk& chunk, const std::string& dset_name, const std::size_t dset_index) diff --git a/Printers/src/printers/hdf5printer/hdf5_combine_tools.cpp b/Printers/src/printers/hdf5printer/hdf5_combine_tools.cpp index daa3fa7122..50a9f572ff 100644 --- a/Printers/src/printers/hdf5printer/hdf5_combine_tools.cpp +++ b/Printers/src/printers/hdf5printer/hdf5_combine_tools.cpp @@ -536,13 +536,9 @@ namespace Gambit size_tot_l = size_tot + size; // Last? //} - for (auto it = valids.end()-1; size > 0; --it) - { - if (*it) - break; - else - --size; - } + // Trim any trailing invalid points (size <= valids.size() ensured above) + while (size > 0 && !valids[size-1]) + --size; HDF5::closeSpace(dataspace); HDF5::closeSpace(dataspace2); @@ -804,13 +800,9 @@ namespace Gambit ranks.push_back(valid_rank); ptids.push_back(valid_ptid); - for (auto it = valids.end()-1; size > 0; --it) - { - if (*it) - break; - else - --size; - } + // Trim any trailing invalid points (size <= valids.size() ensured above) + while (size > 0 && !valids[size-1]) + --size; aux_sizes.push_back(size); HDF5::closeSpace(dataspace); @@ -992,6 +984,8 @@ namespace Gambit // Close resources HDF5::closeDataset(dataset_out); HDF5::closeDataset(dataset2_out); + H5Tclose(type); + H5Tclose(type2); // Move offset so that next batch is written to correct place in output file offset += batch_size_tot + pc_offset; @@ -1152,6 +1146,8 @@ namespace Gambit } // Create new dataset setup_hdf5_points(new_group, type, type2, size_tot, *it); + if(type>=0) H5Tclose(type); + if(type2>=0) H5Tclose(type2); } // Reopen output datasets for copying hid_t dataset_out = HDF5::openDataset(new_group, *it); @@ -1307,6 +1303,7 @@ namespace Gambit // Close resources HDF5::closeDataset(dataset_out); + H5Tclose(type); // Move offset so that next batch is written to correct place in output file offset += batch_size_tot + pc_offset; diff --git a/Printers/src/printers/hdf5printer/hdf5printer.cpp b/Printers/src/printers/hdf5printer/hdf5printer.cpp index 0f861b3f7c..32f9a3e9ef 100644 --- a/Printers/src/printers/hdf5printer/hdf5printer.cpp +++ b/Printers/src/printers/hdf5printer/hdf5printer.cpp @@ -465,7 +465,16 @@ namespace Gambit printer_error().raise(LOCAL_INFO, errmsg.str()); } HDF5::closeFile(file_id); - exit(1); + // Group does not exist in the pre-existing file. Refuse to + // proceed: we are not in resume mode and 'delete_file_on_restart' + // is not set, so adding a new group to a file from an unrelated + // previous run risks silently mixing unrelated data. + std::ostringstream errmsg2; + errmsg2 << "Error preparing pre-existing output file '"<file_id, H5F_SCOPE_GLOBAL); if(err<0) { diff --git a/Printers/src/printers/hdf5printer/hdf5tools.cpp b/Printers/src/printers/hdf5printer/hdf5tools.cpp index 720f25d1eb..df8606caa6 100644 --- a/Printers/src/printers/hdf5printer/hdf5tools.cpp +++ b/Printers/src/printers/hdf5printer/hdf5tools.cpp @@ -667,9 +667,15 @@ namespace Gambit { /// Close hdf5 group SIMPLE_CALL(hid_t, closeGroup, hid_t, H5Gclose, "close", "group", "group") - /// global error variables (handler) - H5E_auto2_t old_func; - void *old_client_data; + /// State for errorsOff() / errorsOn(). The bool guard errors_are_off + /// ensures that a redundant errorsOff() called while errors are already + /// off does NOT overwrite the saved handler with the NULL handler. + namespace + { + bool errors_are_off = false; + H5E_auto2_t saved_func = nullptr; + void *saved_client_data = nullptr; + } // FIXME: This caused compile problems on LISA cluster (CW) /// Silence error report (e.g. while probing for file existence) @@ -683,18 +689,21 @@ namespace Gambit { /// and let me know if it works :) void errorsOff() { + if (errors_are_off) return; // already off; don't clobber saved state /* Save old error handler */ - H5Eget_auto2(H5E_DEFAULT, &old_func, &old_client_data); - + H5Eget_auto2(H5E_DEFAULT, &saved_func, &saved_client_data); /* Turn off error handling */ H5Eset_auto2(H5E_DEFAULT, NULL, NULL); + errors_are_off = true; } /// Restore error report void errorsOn() { + if (!errors_are_off) return; // unbalanced / already on; no-op /* Restore previous error handler */ - H5Eset_auto2(H5E_DEFAULT, old_func, old_client_data); + H5Eset_auto2(H5E_DEFAULT, saved_func, saved_client_data); + errors_are_off = false; } /// @{ Dataset manipulations diff --git a/Printers/src/printers/hdf5printer_v2/hdf5printer_v2.cpp b/Printers/src/printers/hdf5printer_v2/hdf5printer_v2.cpp index 410b27abe3..b6cbf39f50 100644 --- a/Printers/src/printers/hdf5printer_v2/hdf5printer_v2.cpp +++ b/Printers/src/printers/hdf5printer_v2/hdf5printer_v2.cpp @@ -660,108 +660,6 @@ namespace Gambit } #ifdef WITH_MPI - /// Send buffer data to a specified process - void HDF5MasterBuffer::MPI_flush_to_rank(const unsigned int r) - { - for(auto it=all_buffers.begin(); it!=all_buffers.end(); ++it) - { - it->second->MPI_flush_to_rank(r); - } - buffered_points.clear(); - buffered_points_set.clear(); - } - - /// Give process r permission to begin sending its buffer data - // Don't do this for all processes at once, as MPI can run out of - // Recv request IDs behind the scenes if thousands of processes are - // trying to send it lots of stuff at once. But it is good to let - // many processes start sending data before we need it, so that the - // Recv goes quickly (and is effectively already done in the background) - // when we get to it. - void HDF5MasterBuffer::MPI_request_buffer_data(const unsigned int r) - { - logger()<< LogTags::printers << LogTags::info << "Asking process "<(): Npoints = MPI_recv_buffer(r, dset_name); break; - case h5v2_type(): Npoints = MPI_recv_buffer(r, dset_name); break; - case h5v2_type(): Npoints = MPI_recv_buffer(r, dset_name); break; - case h5v2_type(): Npoints = MPI_recv_buffer(r, dset_name); break; - //case h5v2_type(): Npoints = MPI_recv_buffer(r, dset_name); break; - //case h5v2_type(): Npoints = MPI_recv_buffer(r, dset_name); break; - case h5v2_type(): Npoints = MPI_recv_buffer(r, dset_name); break; - case h5v2_type(): Npoints = MPI_recv_buffer(r, dset_name); break; - default: - std::ostringstream errmsg; - errmsg<<"Unrecognised datatype integer (value = "<max_Npoints) max_Npoints = Npoints; - } - } - - logger()<second->N_items_in_buffer()<<" (name="<second->dset_name()<<")"<& blocks, const std::vector>& buf_types) { @@ -1875,17 +1773,32 @@ namespace Gambit // Flush data in buffers to disk void HDF5Printer2::flush() { + // Each rank writes its own buffered points under the cross-rank + // Utils::FileLock taken inside HDF5MasterBuffer::flush(); no MPI + // gather here. The collective gather happens only in finalise(). + // Auxilliary printers register their buffermaster into the primary's + // aux_buffers (see constructor) and have an empty aux_buffers + // themselves, so only the primary needs to iterate. + // Sync streams before RA streams: matches finalise()'s ordering, + // since RA writes target positions established by sync writes. buffermaster.flush(); + if(not is_auxilliary_printer()) + { + for(auto it=aux_buffers.begin(); it!=aux_buffers.end(); ++it) + { + if((*it)->is_synchronised()) (*it)->flush(); + } + for(auto it=aux_buffers.begin(); it!=aux_buffers.end(); ++it) + { + if(not (*it)->is_synchronised()) (*it)->flush(); + } + } } // Make sure printer output is fully on disk and safe // No distinction between final and early termination procedure for this printer. void HDF5Printer2::finalise(bool /*abnormal*/) { - // DEBUG h5v2_BLOCK message counter - //recv_counter = 0; - //send_counter = 0; - // The primary printer will take care of finalising all output. if(not is_auxilliary_printer()) { @@ -1955,16 +1868,6 @@ namespace Gambit std::ostringstream buffer_nonempty_report; bool buffer_nonempty_warn(false); - std::size_t final_size; - if(myRank==0) - { - /// Need to know final nominal dataset size to ensure unsynchronised datasets match synchronised ones. - buffermaster.lock_and_open_file(); - final_size = buffermaster.get_next_free_position(); - buffermaster.close_and_unlock_file(); - std::cout<<"Final dataset size is "<is_synchronised()) + /// Need to know final nominal dataset size to ensure unsynchronised datasets match synchronised ones. + buffermaster.lock_and_open_file(); + std::size_t final_size = buffermaster.get_next_free_position(); + buffermaster.close_and_unlock_file(); + std::cout<<"Final dataset size is "<flush(); - // Check if everything managed to flush! - if(not (*it)->all_buffers_empty()) + if(not (*it)->is_synchronised()) { - buffer_nonempty_report<<(*it)->buffer_status(); - buffer_nonempty_warn = true; + (*it)->flush(); + // Check if everything managed to flush! + if(not (*it)->all_buffers_empty()) + { + buffer_nonempty_report<<(*it)->buffer_status(); + buffer_nonempty_warn = true; + } + // Make sure final dataset size is correct for the unsynchronised buffers + (*it)->extend_all_datasets_to(final_size); } - // Make sure final dataset size is correct for the unsynchronised buffers - (*it)->extend_all_datasets_to(final_size); } } @@ -2020,10 +1939,6 @@ namespace Gambit } logger()<< LogTags::printers << LogTags::info << "HDF5Printer2 output finalisation complete."<