% Records for "Hydrographic Data Processing on a Robust, Network-Coupled
% Parallel Cluster": the 2015 master's thesis plus two records of the 2012
% conference presentation. The numeric citation keys are kept unchanged so
% that existing citations continue to resolve.

% Master's thesis record.
% The value "Computer Science" was exported in the volume field, which is
% meant for a bound-volume number and is not used by thesis styles.
% NOTE(review): it is presumably the degree department; it is preserved in
% the (style-ignored) department field -- confirm.
@mastersthesis{7291,
  author     = {Venugopal, Rohit},
  title      = {Hydrographic Data Processing on a Robust, Network-Coupled Parallel Cluster},
  school     = {University of New Hampshire},
  address    = {Durham, NH},
  year       = {2015},
  month      = dec,
  pages      = {100},
  department = {Computer Science},
  url        = {https://scholars.unh.edu/thesis/1060},
  abstract   = {There have been tremendous advances in acoustic sensor technologies and widespread adoption of multibeam echo-sounders in the recent past, which have enabled the efficient collection of large quantities of bathymetric data in every survey. However, timely dissemination of this data to the scientific community has been constrained by the relatively slow progress in the development of new data processing architectures. The current solutions for powerful, efficient and near-real time data processing systems entail high capital investments and technical complexities. Therefore, the installation base for these systems has been very small. The work presented here proposes a new architecture for bathymetric data processing based on parallel computing paradigms. The solution works by distributing the processing workload across a cluster of network-attached compute nodes. The success of using parallel processing for bathymetric data depends on the accurate measurement of the processing workload and its effective distribution across the compute nodes, thereby maximizing speedup and efficiency. These compute resources can be existing installations and other COTS components, such as blade servers, thereby reducing installation and maintenance expenditure.

For workload determination, an estimation algorithm was developed that uses stochastic sampling of the raw bathymetric data file. This produces a low cost and high accuracy estimate of the processing requirements for each line to be processed. This workload information, coupled with file and system metadata, is used as input to different load balancing algorithms - First Come First Served (FCFS), Longest Job First (LJF) and Contention-Reduction (CR). The performance of FCFS and LJF algorithms is highly dependent on the characteristics of the input dataset while CR scheduling aims to characterize the input and adjust load distribution for the best combination of speedup and efficiency. The choice of these algorithms depends on the requirements of the installation, i.e. prioritization of speedup or efficiency. To ensure robustness, watchdog mechanisms monitor the state of all the components of the processing system and can react to system faults and failures, through a combination of automated and manual techniques. Although not part of the current implementation, there is potential for adding redundant critical components and to enable live-failover, thereby reducing or eliminating system downtime.

The methods for workload estimation and distribution are templates for extending this framework to include additional types of bathymetric data and develop flexible, self-learning algorithms to deal with diverse datasets. This research lays the groundwork for the design of a ship-based system that would enable near-real time data processing and result in a faster ping-to-chart solution.},
}

% Conference presentation record (with conference URL, no abstract).
% It was exported as an article without a journal; the venue, date range and
% URL describe a conference contribution, so it is typed as inproceedings.
% NOTE(review): booktitle is inferred from the URL path and from the mention
% of "Shallow Survey 2012" in the abstract of record 5036 -- confirm the
% official proceedings title.
% The day range is attached to the month macro so the dates are not lost.
@inproceedings{4966,
  author    = {Calder, Brian R. and Venugopal, Rohit},
  title     = {Hydrographic Data Processing on a Robust, Network-Coupled Parallel Cluster},
  booktitle = {Shallow Survey 2012},
  address   = {Wellington, New Zealand},
  year      = {2012},
  month     = feb # "~21--26,",
  keywords  = {Data Processing - CUBE},
  url       = {http://www.conference.co.nz/shallowsurvey},
}

% Conference presentation record (with abstract, no URL).
% NOTE(review): title, authors, date and venue match record 4966, so this
% looks like a duplicate of the same presentation; both keys are kept because
% either may be cited -- consider merging.
% The abstract had export artefacts (escaped non-breaking spaces and
% backslash-prefixed curly quotes); they are replaced with plain spaces and
% LaTeX quote characters.
@inproceedings{5036,
  author    = {Calder, Brian R. and Venugopal, Rohit},
  title     = {Hydrographic Data Processing on a Robust, Network-Coupled Parallel Cluster},
  booktitle = {Shallow Survey 2012},
  address   = {Wellington, New Zealand},
  year      = {2012},
  month     = feb # "~21--26,",
  keywords  = {CHRT, Data Processing, Parallel Processing},
  abstract  = {Increasing data volumes and adoption of computer-assisted hydrographic data processing algorithms necessitate higher data processing rates if gains in efficiency achieved in the last decade are to be maintained and enhanced. Recent advances in desktop computer architectures have made multi-core and multi-processor systems readily available, and some advances have been made in implementing multi-threaded versions of common hydrographic data processing algorithms. In many cases, however, although the algorithms might be ideal for parallel implementation (so called `embarrassingly parallel' tasks), limitations in memory, disc and network bandwidth within a single system can have significant limitations on the scalability of these solutions.

Offloading the computational requirements to a separate, clustered system of multiple computers is therefore appealing, since it has the potential for much higher net bandwidth, and robustness, without the collateral constraints of a desktop system. We consider, therefore, the advantages, potential efficiency gains, and difficulties, of processing hydrographic data in a robust, network-coupled, parallel cluster of computers. In particular, we address the problems of efficient and robust data distribution, compute load and network balancing, and of ensuring task- and system-level robustness in such a distributed system.

To illustrate the problem, we have considered two common processing tasks: pre-processing of raw Multibeam Echosounder (MBES) data to the stage of uncertainty-attributed resolved soundings in the local level, and computation of most-probable depths with a CUBE-like algorithm. These tasks illustrate a time- and spatially-indexed processing problem, respectively, which can engender differences in optimal data distribution and have different data- and network-use patterns. We demonstrate the gains and limitations of a clustered compute solution in these two cases, using the metrics of computational time as a function of processor resources committed, and robustness of processing in the face of intermittent random failures, as applied to (portions of) the Shallow Survey 2012 Common Data Set.},
}