From 3f31831a605464777be30bd6bcff107dd11339fb Mon Sep 17 00:00:00 2001 From: Divy Patel Date: Mon, 4 Dec 2023 21:59:12 -0600 Subject: [PATCH 1/5] fix: record parsing when reading from storage as a run page Signed-off-by: Divy Patel --- External-Sort/StorageDevice.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/External-Sort/StorageDevice.cpp b/External-Sort/StorageDevice.cpp index f3885ea..04faecd 100644 --- a/External-Sort/StorageDevice.cpp +++ b/External-Sort/StorageDevice.cpp @@ -156,6 +156,9 @@ void StorageDevice::spill_run(char run_bit, uint run, vector record int last_run = this->get_last_run(); run_path = this->device_path + "/run_" + to_string(last_run + 1); } else if (run_bit == 't') { + string trace_str = "STATE -> SPILL_RUNS_" + this->device_path + ": Spill sorted runs to the " + this->device_path + " device"; + + trace.append_trace(trace_str); run_path = this->device_path + "/temp_run"; } else { run_path = this->device_path + "/run_" + to_string(run); From 8e1c03eddf25c53546f051051159a61d72ab067a Mon Sep 17 00:00:00 2001 From: Divy Patel Date: Wed, 6 Dec 2023 15:05:22 -0600 Subject: [PATCH 2/5] feat: add ExternalSort project documentation adhereing to canvas course project instructions Signed-off-by: Divy Patel --- External-Sort/DataRecord.cpp | 2 +- External-Sort/Tree.cpp | 563 ++++++++++++++++++++--------------- External-Sort/Tree.h | 2 +- README.md | 64 +++- TODO | 4 - 5 files changed, 380 insertions(+), 255 deletions(-) diff --git a/External-Sort/DataRecord.cpp b/External-Sort/DataRecord.cpp index 073d798..23b1183 100644 --- a/External-Sort/DataRecord.cpp +++ b/External-Sort/DataRecord.cpp @@ -119,7 +119,7 @@ bool DataRecord::is_smaller_int (const DataRecord incoming_record) const return false; } -bool DataRecord::is_smaller_str(const DataRecord incoming_record) const +bool DataRecord::is_smaller_str (const DataRecord incoming_record) const { int incoming_length = 0 ; char const *incoming_iter = NULL; int current_length = 0 ; char const 
*current_iter = NULL; diff --git a/External-Sort/Tree.cpp b/External-Sort/Tree.cpp index 6544769..3c3aaa1 100755 --- a/External-Sort/Tree.cpp +++ b/External-Sort/Tree.cpp @@ -8,36 +8,45 @@ #define NODE_RECORD_LIST(node) node->list->record_ptr #define NODE_RECORD_LIST_AT(node, idx) node->list->record_ptr[idx] #define NODE_RECORD_LIST_LENGTH(node) node->list->record_count -#define CHECK_SET_EMPTY(node, node_idx) if (node != NULL) {\ - if (node->list->record_ptr.empty()) {\ - node->is_empty = true;\ - }\ - } +#define CHECK_SET_EMPTY(node, node_idx) \ + if (node != NULL) \ + { \ + if (node->list->record_ptr.empty()) \ + { \ + node->is_empty = true; \ + } \ + } #define SET_INTERNAL_EMPTY(node) node->current_record = NULL; -DataRecord* pop_record(RecordList *list) { - DataRecord* top = NULL; - if (list == NULL) { +DataRecord *pop_record(RecordList *list) +{ + DataRecord *top = NULL; + if (list == NULL) + { return NULL; } - if (!list->record_ptr.empty()) { + if (!list->record_ptr.empty()) + { top = new DataRecord(list->record_ptr.front()); // top = &temp; list->record_ptr.pop_front(); - #if DEBUG_PRINT - // top->print(); - #endif +#if DEBUG_PRINT +// top->print(); +#endif list->record_count--; } return top; } -DataRecord* top_record(RecordList *list) { +DataRecord *top_record(RecordList *list) +{ DataRecord *top = NULL; - if (list == NULL) { + if (list == NULL) + { return NULL; } - if (!list->record_ptr.empty()){ + if (!list->record_ptr.empty()) + { top = new DataRecord(list->record_ptr.front()); // return &list->record_ptr.front(); return top; @@ -46,8 +55,8 @@ DataRecord* top_record(RecordList *list) { } /* -* Pass the list of sorted runs as part of structure RecordList -*/ + * Pass the list of sorted runs as part of structure RecordList + */ Tree::Tree(vector sorted_runs) { @@ -56,31 +65,37 @@ Tree::Tree(vector sorted_runs) this->total_nodes = 2 * pow(2, this->tree_depth) - 1; this->heap = new struct Node[this->total_nodes]; this->total_leaves = pow(2, 
this->tree_depth); - llint first_leaf_node = this->total_nodes - ((this->total_nodes - 1)/2) - 1; + llint first_leaf_node = this->total_nodes - ((this->total_nodes - 1) / 2) - 1; lluint jj = 0, current_run = 0; lluint ii = first_leaf_node; - for (lluint ii = 0 ; ii < this->total_nodes ; ii++) { + for (lluint ii = 0; ii < this->total_nodes; ii++) + { this->heap[ii].list = NULL; this->heap[ii].current_record = NULL; } - for ( ; ii < (this->total_leaves*2) - 1 ; ii++) { + for (; ii < (this->total_leaves * 2) - 1; ii++) + { this->heap[ii].current_record = NULL; this->heap[ii].is_empty = false; this->heap[ii].is_leaf = true; this->heap[ii].list = sorted_runs[jj]; current_run++; - if (current_run < sorted_runs.size()) { + if (current_run < sorted_runs.size()) + { jj += 1; // printf("%p\n", (void*)each_run); - } else { + } + else + { ii++; break; } } - if (ii < ((this->total_leaves*2) - 1)) { + if (ii < ((this->total_leaves * 2) - 1)) + { this->heap[ii].current_record = NULL; this->heap[ii].is_empty = true; this->heap[ii].is_leaf = true; @@ -96,18 +111,19 @@ Tree::Tree(vector sorted_runs) * * Leaf nodes can contain only list (which may have 1/more records), * or can be empty as well (can happen after a few runs are merged) * It also stores the count of records in that particular node. - * + * * * By default, we will try to create a full binary tree. 
- * + * */ Tree::Tree(DataRecord *records, llint record_ct, llint initial_run) { // TODO: See if this is optimal division for fanning - this->total_leaves = record_ct/2; + this->total_leaves = record_ct / 2; DataRecord *current_ptr = records; - llint count_of_cols_per_row = ceil(record_ct/this->total_leaves); + llint count_of_cols_per_row = ceil(record_ct / this->total_leaves); - if (initial_run) { + if (initial_run) + { count_of_cols_per_row = 1; this->total_leaves = record_ct; } @@ -117,14 +133,15 @@ Tree::Tree(DataRecord *records, llint record_ct, llint initial_run) this->total_record_count = record_ct; this->heap = new struct Node[this->total_nodes]; - llint first_leaf_node = this->total_nodes - ((this->total_nodes - 1)/2) - 1; + llint first_leaf_node = this->total_nodes - ((this->total_nodes - 1) / 2) - 1; llint current_ct = record_ct; int start = 0; // We always try to generate full binary tree at the beginning // (last leaf may not be balanced) - for (lluint ii = first_leaf_node ; ii < (this->total_leaves*2) - 1 ; ii++) { + for (lluint ii = first_leaf_node; ii < (this->total_leaves * 2) - 1; ii++) + { // Leaf nodes has no current record this->heap[ii].current_record = NULL; this->heap[ii].is_empty = false; @@ -132,7 +149,8 @@ Tree::Tree(DataRecord *records, llint record_ct, llint initial_run) // Assign records to each row this->heap[ii].list = new RecordList; - for (int ii = start; ii < start + count_of_cols_per_row; ii++) { + for (int ii = start; ii < start + count_of_cols_per_row; ii++) + { this->heap[ii].list->record_ptr.push_back(*current_ptr); current_ptr++; } @@ -140,19 +158,24 @@ Tree::Tree(DataRecord *records, llint record_ct, llint initial_run) // this->heap[ii].list->record_ptr = current_ptr; this->heap[ii].list->record_count = count_of_cols_per_row; - if (current_ct > 0) { + if (current_ct > 0) + { start += count_of_cols_per_row; current_ct -= count_of_cols_per_row; - } else { + } + else + { break; } // Sample calculation: - // For 128 records, 
there will be 7 rows == + // For 128 records, there will be 7 rows == // 18 count_of_cols_per_row + 2 remaining // for last row, current_ct will be 2 after subtraction, // so we will just add it to the last row - if (initial_run == 0) { - if ((current_ct > 0) && (current_ct <= 2*count_of_cols_per_row)) { + if (initial_run == 0) + { + if ((current_ct > 0) && (current_ct <= 2 * count_of_cols_per_row)) + { count_of_cols_per_row = current_ct; current_ct = -1; } @@ -160,173 +183,226 @@ Tree::Tree(DataRecord *records, llint record_ct, llint initial_run) } } -llint Tree::capacity(llint level) { - return (1<heap[parent]); - // None of the children are valid == not being used as runs - if (child_left >= unused_leaves_idx) { - return; - } else if (child_left < unused_leaves_idx /*Only left child is valid, special case only valid at leaf nodes */ - && child_right >= unused_leaves_idx) { - struct Node *left_child_node = &this->heap[child_left]; - // If parent has a valid record, it will be the winner, so skip. - // Iteration over the next level will empty this node - // the iteration after that for this level would fill it up. - if (parent_node->current_record) { - return; - } else { - if (IS_LEAF_NODE(left_child_node)) { - parent_node->current_record = pop_record(left_child_node->list); - } - } - } else { /* Both the children are valid. So add the popped record at parent (if empty) */ - if (parent_node->current_record) { - return; - } else { + // None of the children are valid == not being used as runs + if (child_left >= unused_leaves_idx) + { + return; + } + else if (child_left < unused_leaves_idx /*Only left child is valid, special case only valid at leaf nodes */ + && child_right >= unused_leaves_idx) + { + struct Node *left_child_node = &this->heap[child_left]; + // If parent has a valid record, it will be the winner, so skip. + // Iteration over the next level will empty this node + // the iteration after that for this level would fill it up. 
+ if (parent_node->current_record) + { + return; + } + else + { + if (IS_LEAF_NODE(left_child_node)) + { + parent_node->current_record = pop_record(left_child_node->list); + } + } + } + else + { /* Both the children are valid. So add the popped record at parent (if empty) */ + if (parent_node->current_record) + { + return; + } + else + { struct Node *left_child_node = &(this->heap[child_left]); - struct Node *right_child_node = &(this->heap[child_right]); - DataRecord *left_data = NULL, *right_data = NULL; - if (IS_LEAF_NODE(left_child_node)) { - // Both will be a leaf node - left_data = top_record(left_child_node->list); - right_data = top_record(right_child_node->list); - if ((right_data != NULL) & (left_data != NULL)) { - // Compare with OVCs only if both the ovcs exist - if ((left_data->ovc != 0) & (right_data->ovc != 0)) { - if (left_data->ovc < right_data->ovc) { - // If OVC is strictly smaller, the data record is small - parent_node->current_record = pop_record(left_child_node->list); - CHECK_SET_EMPTY(left_child_node, child_left); - delete left_data; delete right_data; - return; - } else if (left_data->ovc > right_data->ovc) { - // If OVC is strictly greater, the data record is larger - parent_node->current_record = pop_record(right_child_node->list); - CHECK_SET_EMPTY(right_child_node, child_right); - delete left_data; delete right_data; - return; - } - } - // If OVC do not exist for either or are equal, we need to check their actual - // values and update OVC based on the new winner - if (left_data->is_smaller_str(*right_data)) { + struct Node *right_child_node = &(this->heap[child_right]); + DataRecord *left_data = NULL, *right_data = NULL; + if (IS_LEAF_NODE(left_child_node)) + { + // Both will be a leaf node + left_data = top_record(left_child_node->list); + right_data = top_record(right_child_node->list); + if ((right_data != NULL) & (left_data != NULL)) + { + // Compare with OVCs only if both the ovcs exist + if ((left_data->ovc != 0) & 
(right_data->ovc != 0)) + { + if (left_data->ovc < right_data->ovc) + { + // If OVC is strictly smaller, the data record is small + parent_node->current_record = pop_record(left_child_node->list); + CHECK_SET_EMPTY(left_child_node, child_left); + delete left_data; + delete right_data; + return; + } + else if (left_data->ovc > right_data->ovc) + { + // If OVC is strictly greater, the data record is larger + parent_node->current_record = pop_record(right_child_node->list); + CHECK_SET_EMPTY(right_child_node, child_right); + delete left_data; + delete right_data; + return; + } + } + // If OVC do not exist for either or are equal, we need to check their actual + // values and update OVC based on the new winner + if (left_data->is_smaller_str(*right_data)) + { // left_data->print(); right_data->print(); - // Left is the winner -> set OVC of right relative to left - right_data->populate_ovc_int(*left_data); - parent_node->current_record = pop_record(left_child_node->list); - CHECK_SET_EMPTY(left_child_node, child_left); - delete left_data; delete right_data; - return; - } else { - // Right is the winner -> set OVC of left relative to right - left_data->populate_ovc_int(*right_data); - parent_node->current_record = pop_record(right_child_node->list); - CHECK_SET_EMPTY(right_child_node, child_right); - delete left_data; delete right_data; - return; - } - } else if (left_data) { - parent_node->current_record = pop_record(left_child_node->list); - CHECK_SET_EMPTY(left_child_node, child_left); + // Left is the winner -> set OVC of right relative to left + right_data->populate_ovc_int(*left_data); + parent_node->current_record = pop_record(left_child_node->list); + CHECK_SET_EMPTY(left_child_node, child_left); + delete left_data; + delete right_data; + return; + } + else + { + // Right is the winner -> set OVC of left relative to right + left_data->populate_ovc_int(*right_data); + parent_node->current_record = pop_record(right_child_node->list); + 
CHECK_SET_EMPTY(right_child_node, child_right); + delete left_data; + delete right_data; + return; + } + } + else if (left_data) + { + parent_node->current_record = pop_record(left_child_node->list); + CHECK_SET_EMPTY(left_child_node, child_left); delete left_data; - return; - } else if (right_data) { - parent_node->current_record = pop_record(right_child_node->list); - CHECK_SET_EMPTY(right_child_node, child_right); + return; + } + else if (right_data) + { + parent_node->current_record = pop_record(right_child_node->list); + CHECK_SET_EMPTY(right_child_node, child_right); delete right_data; - return; - } else { - parent_node->current_record = NULL; - return; - // At this point, both of the left and right should have been reported as empty, so no need to update. - } - } else { - // It is an internal node - left_data = NODE_CURRENT_RECORD(left_child_node); - right_data = NODE_CURRENT_RECORD(right_child_node); - if (left_data && right_data) { - // Compare with the OVCs only, if they exist - if ((left_data->ovc != 0) & (right_data->ovc != 0)) { - if (left_data->ovc < right_data->ovc) { - // If OVC is strictly smaller, the data record is small - parent_node->is_empty = false; - parent_node->current_record = left_data; - SET_INTERNAL_EMPTY(left_child_node); - return; - } else if (left_data->ovc > right_data->ovc) { - // If OVC is strictly greater, the data record is larger - parent_node->is_empty = false; - parent_node->current_record = right_data; - SET_INTERNAL_EMPTY(right_child_node); - return; - } - } - // If OVC are equal or do not exist for either, we need to check - // their actual values and update OVC based on the new winner - if (left_data->is_smaller_str(*right_data)) { - parent_node->is_empty = false; - right_data->populate_ovc_int(*left_data); - parent_node->current_record = left_data; - SET_INTERNAL_EMPTY(left_child_node); - return; - } else { - parent_node->is_empty = false; - left_data->populate_ovc_int(*right_data); - parent_node->current_record = 
right_data; - SET_INTERNAL_EMPTY(right_child_node); - return; - } - } else if (left_data) { - parent_node->is_empty = false; - parent_node->current_record = left_data; - NODE_CURRENT_RECORD(left_child_node) = NULL; - } else if (right_data) { - parent_node->is_empty = false; - parent_node->current_record = right_data; - NODE_CURRENT_RECORD(right_child_node) = NULL; - } else { - parent_node->is_empty = true; - parent_node->current_record = NULL; - } - } - } - } + return; + } + else + { + parent_node->current_record = NULL; + return; + // At this point, both of the left and right should have been reported as empty, so no need to update. + } + } + else + { + // It is an internal node + left_data = NODE_CURRENT_RECORD(left_child_node); + right_data = NODE_CURRENT_RECORD(right_child_node); + if (left_data && right_data) + { + // Compare with the OVCs only, if they exist + if ((left_data->ovc != 0) & (right_data->ovc != 0)) + { + if (left_data->ovc < right_data->ovc) + { + // If OVC is strictly smaller, the data record is small + parent_node->is_empty = false; + parent_node->current_record = left_data; + SET_INTERNAL_EMPTY(left_child_node); + return; + } + else if (left_data->ovc > right_data->ovc) + { + // If OVC is strictly greater, the data record is larger + parent_node->is_empty = false; + parent_node->current_record = right_data; + SET_INTERNAL_EMPTY(right_child_node); + return; + } + } + // If OVC are equal or do not exist for either, we need to check + // their actual values and update OVC based on the new winner + if (left_data->is_smaller_str(*right_data)) + { + parent_node->is_empty = false; + right_data->populate_ovc_int(*left_data); + parent_node->current_record = left_data; + SET_INTERNAL_EMPTY(left_child_node); + return; + } + else + { + parent_node->is_empty = false; + left_data->populate_ovc_int(*right_data); + parent_node->current_record = right_data; + SET_INTERNAL_EMPTY(right_child_node); + return; + } + } + else if (left_data) + { + 
parent_node->is_empty = false; + parent_node->current_record = left_data; + NODE_CURRENT_RECORD(left_child_node) = NULL; + } + else if (right_data) + { + parent_node->is_empty = false; + parent_node->current_record = right_data; + NODE_CURRENT_RECORD(right_child_node) = NULL; + } + else + { + parent_node->is_empty = true; + parent_node->current_record = NULL; + } + } + } + } } -struct Node Tree::leaf(llint index, llint current_slot) { - return this->heap[current_slot*2 + index]; +struct Node Tree::leaf(llint index, llint current_slot) +{ + return this->heap[current_slot * 2 + index]; } -struct Node Tree::parent(llint current_slot) { - return this->heap[current_slot/2]; +struct Node Tree::parent(llint current_slot) +{ + return this->heap[current_slot / 2]; } /* -* Each call runs the tree once, to generate one entry of merged run -*/ -void Tree::run_tree() { + * Each call runs the tree once, to generate one entry of merged run + */ +void Tree::run_tree() +{ llint unused_leaves_idx = (this->total_nodes + 1) / 2 - 1 + this->total_leaves; // Each iteration will give one of the priority queue elements, // run for each of the inner nodes for (llint inner_node_idx = this->total_nodes - pow(2, this->tree_depth) - 1; - inner_node_idx >= 0; - inner_node_idx--) { - this->compare_and_swap(inner_node_idx, unused_leaves_idx); + inner_node_idx >= 0; + inner_node_idx--) + { + this->run_tournament(inner_node_idx, unused_leaves_idx); } #if DEBUG_PRINT - cout<<"The heap in iteration "<print_heap(); #endif // this->heap[0].current_record->print(); @@ -335,73 +411,86 @@ void Tree::run_tree() { } /* -* Prints (index Empty) for empty nodes -* Prints "count -> [heap_index @ list_index :: (datarecord)]" list -* -* For e.g.: -* (1 Empty ) -* (2 Empty ) -* Count: 0 -> -* Count: 1 -> [5 @ 0 :: (6, 6, 6)] -* Count: 3 -> [6 @ 0 :: (7, 7, 7)] [6 @ 1 :: (8, 8, 8)] [6 @ 2 :: (9, 9, 9)] -*/ -void Tree::print_heap() { - cout<<"Tree depth: "<tree_depth+1<<", Total nodes: "<total_nodes<<", Total 
leaves: "<total_leaves<total_nodes; ii++) { + * Prints (index Empty) for empty nodes + * Prints "count -> [heap_index @ list_index :: (datarecord)]" list + * + * For e.g.: + * (1 Empty ) + * (2 Empty ) + * Count: 0 -> + * Count: 1 -> [5 @ 0 :: (6, 6, 6)] + * Count: 3 -> [6 @ 0 :: (7, 7, 7)] [6 @ 1 :: (8, 8, 8)] [6 @ 2 :: (9, 9, 9)] + */ +void Tree::print_heap() +{ + cout << "Tree depth: " << this->tree_depth + 1 << ", Total nodes: " << this->total_nodes << ", Total leaves: " << this->total_leaves << endl; + for (lluint ii = 0; ii < this->total_nodes; ii++) + { // if (!this->heap[ii].is_empty) { - if (this->heap[ii].current_record) { - printf("%lld :: (%.4s, %.4s, %.4s)@(%d:%c)\n", - ii, this->heap[ii].current_record->_record[0], - this->heap[ii].current_record->_record[1], - this->heap[ii].current_record->_record[2], - this->heap[ii].current_record->ovc, - this->heap[ii].current_record->rel); - } else { - RecordList *heap_list = this->heap[ii].list; - if (heap_list == NULL) { - printf("\n(%lld Empty )\n", ii); - continue; - } - lluint jj = 0; - printf("\n(%lld (Count: %lld) -> ", ii, heap_list->record_count); - for (auto current_record : heap_list->record_ptr) { - printf("[%lld @ %lld :: (%.4s, %.4s, %.4s)@(%d:%c)] ", - ii, jj, current_record._record[0], - current_record._record[1], - current_record._record[2], - current_record.ovc, - current_record.rel); - jj++; - } - printf(")\n"); + if (this->heap[ii].current_record) + { + printf("%lld :: (%.4s, %.4s, %.4s)@(%d:%c)\n", + ii, this->heap[ii].current_record->_record[0], + this->heap[ii].current_record->_record[1], + this->heap[ii].current_record->_record[2], + this->heap[ii].current_record->ovc, + this->heap[ii].current_record->rel); + } + else + { + RecordList *heap_list = this->heap[ii].list; + if (heap_list == NULL) + { + printf("\n(%lld Empty )\n", ii); + continue; + } + lluint jj = 0; + printf("\n(%lld (Count: %lld) -> ", ii, heap_list->record_count); + for (auto current_record : heap_list->record_ptr) + { + 
printf("[%lld @ %lld :: (%.4s, %.4s, %.4s)@(%d:%c)] ", + ii, jj, current_record._record[0], + current_record._record[1], + current_record._record[2], + current_record.ovc, + current_record.rel); + jj++; } + printf(")\n"); + } // } else { // printf("\n(%lld Empty )\n", ii); // } } } -vector Tree::get_empty_leaves() { +vector Tree::get_empty_leaves() +{ vector empty_leaf_idx_list; llint first_leaf_idx = pow(2, this->tree_depth) - 1; - for (lluint ii = first_leaf_idx; ii < this->total_nodes; ii++) { + for (lluint ii = first_leaf_idx; ii < this->total_nodes; ii++) + { if ((this->heap[ii].is_empty) && - (this->heap[ii].list->record_ptr.empty())) { - empty_leaf_idx_list.push_back(ii); + (this->heap[ii].list->record_ptr.empty())) + { + empty_leaf_idx_list.push_back(ii); } } return empty_leaf_idx_list; } /* -* Add new records at a leaf node (only if the existing list is exhausted) -*/ -llint Tree::add_run_at_leaf(llint leaf_node_index, DataRecord *record_list, llint record_ct) { + * Add new records at a leaf node (only if the existing list is exhausted) + */ +llint Tree::add_run_at_leaf(llint leaf_node_index, DataRecord *record_list, llint record_ct) +{ this->heap[leaf_node_index].is_empty = false; - if (this->heap[leaf_node_index].list == NULL) { + if (this->heap[leaf_node_index].list == NULL) + { this->heap[leaf_node_index].list = new RecordList; } - for (llint ii = 0 ; ii < record_ct; ii++) { + for (llint ii = 0; ii < record_ct; ii++) + { this->heap[leaf_node_index].list->record_ptr.push_back(*record_list); record_list++; } @@ -410,31 +499,35 @@ llint Tree::add_run_at_leaf(llint leaf_node_index, DataRecord *record_list, llin return 0; } -void Tree::spillover_run() { - for (auto a: this->generated_run) { +void Tree::spillover_run() +{ + for (auto a : this->generated_run) + { delete a; } this->generated_run.clear(); } /* -* Prints a sorted run -*/ -void Tree::print_run() { - for (auto a: this->generated_run) { + * Prints a sorted run + */ +void Tree::print_run() +{ + for 
(auto a : this->generated_run) + { a->print(); } return; } -vector Tree::get_generated_run() +vector Tree::get_generated_run() { return this->generated_run; } -Tree::~Tree () +Tree::~Tree() { - delete [] this->heap; + delete[] this->heap; TRACE(ENABLE_TRACE); } diff --git a/External-Sort/Tree.h b/External-Sort/Tree.h index 563269a..abb6e40 100755 --- a/External-Sort/Tree.h +++ b/External-Sort/Tree.h @@ -53,7 +53,7 @@ class Tree // Tournament tree functions void run_tree(); - void compare_and_swap(llint parent, llint unused_leaf_idx); + void run_tournament(llint parent, llint unused_leaf_idx); vector get_empty_leaves(); llint add_run_at_leaf(llint leaf_node_idx, DataRecord *sorted_run, llint number_of_records); void spillover_run(); diff --git a/README.md b/README.md index 960023f..58e2596 100644 --- a/README.md +++ b/README.md @@ -9,20 +9,20 @@ External Sorting algorithm for Databases having constrained storage hierarchy # Techniques Implemented by our submission and the corresponding Source Files and Lines -- Tournament trees [5] -- Offset-value coding [5] +- Tournament trees [5]: File Tree.cpp @ Line 196 +- Offset-value coding [5]: File DataRecord.cpp @ Line 122 - Minimum count of row & column comparisons [5] -- Cache-size mini runs [5] -- Device-optimized page sizes [5] -- Spilling memory-to-SSD [5] -- Spilling from SSD to disk [5] -- Graceful degradation - - Into merging [5] +- Cache-size mini runs [5]: File SortRecords.cpp @ Line 26 +- Device-optimized page sizes [5]: File SortRecords.cpp @ Line 81 and Line 136 +- Spilling memory-to-SSD [5]: File SortRecords.cpp @ Line 65 +- Spilling from SSD to disk [5]: File SortRecords.cpp @ Line 69 and Line 125 +- Graceful degradation: File SortRecords.cpp @ Line 72, Line 74 and Line 151 + - Into merging [5] - Beyond one merge step [5] -- Optimized merge patterns [5] -- Verifying - - sets of rows & values [5] - - sort order [5] +- Optimized merge patterns [5]: File SortRecords.cpp @ Line 150 and Line 151 +- Verifying: File 
Iterator.cpp @ Line 69 and Line 84 + - sets of rows & values [5]: File Iterator.cpp @ Line 84 + - sort order [5]: File Iterator.cpp @ Line 69 - Replacement selection? - Run size > memory size? @@ -32,9 +32,45 @@ External Sorting algorithm for Databases having constrained storage hierarchy - Quicksort -# Reasons we chose to implement the specific subset of techniques -# Project's state(complete or have what kinds of bugs) +# Reasons we chose to implement the specific subset of techniques +- `Tournament-tree priority queue` was used in order to achieve `high fan-in` for merging our sorted run inputs of records +- `Offset-value coding` was used to achieve `minimum column value comparisons` +- `Cache-size mini runs` were used to be able to fit the sort inputs, for tournament-tree, in the cache. This enabled us to leverage the low-latency accesses when there are `cache hits` +- `Device-optimized page sizes` were used in order to being cognizant about the `access-profile(latency, bandwidth)` of various devices in the storage hierarchy. For `SSD`, we used `8KB(100 MB/s * 0.1 ms ~ 10KB)` and for `HDD`, we used `1MB(100 MB/s * 10 ms ~ 1MB)` +- We achieved graceful-degradation by spilling `cache-size runs from cache to memory`, `spilling memory-size runs from memory to SSD` and `spilling SSD-size runs from SSD to HDD` +- Also `HDD-page size(1MB)` sorted runs were written to `SSD` prior to actually merging runs on the `HDD`. This is to leverage low-latency accesses of flash drives(SSD) +- `Sort-order`, `set of rows` and their `values` were verified as part of sorting the input records. 
This is to verify the `correctness` and `integrity` of our sort algorithm + + # Project's state +- The implementation of the `External-Sort` is complete with all of the techniques which were expected from us as part of the course project +- The sort was tested against `1KB` size records and with `12M` number of records(although it takes ~1hr to complete the sort, for this particular test-case) +- The sort algorithm was tested against `valgrind` to check for any memory leaks introduced while developing. The codebase does not have any memory leaks, from the latest leak-report on the most recent code version + # How to run our programs +- To run our program, first compile the source code using following command, under `External-Sort` directory +``` +$ cd External-Sort +$ make ExternalSort.exe +``` +- After compiling the source code, to execute the External Sort with custom arguments, run following command inside `External-Sort` directory +``` +# Where, +# "-c" gives the total number of records +# "-s" is the individual record size +# "-o" is the trace of your program run +$ ./ExternalSort.exe -c 120 -s 10 -o trace0.txt +``` + +- The program creates three directories on the completion of the sort algorithm: + - `input`: This directory consists of the input table which has records generated by the random-generator in arbitrary order + - `output`: This directory consists of the output table which has records from input table but in a sorted order, sorted using our sort algorithm + - `trace`: This directory consists of trace files generated from the sort. The trace file consists of logs related to SSD and HDD device accesses. 
And the logs related to sort state machine + +- In order to remove all the generated binaries, executables, and the utility directories mentioned above, run the following command +``` +$ make clean +``` # Initial Setup ``` diff --git a/TODO b/TODO index 6376d9b..e69de29 100644 --- a/TODO +++ b/TODO @@ -1,4 +0,0 @@ -- Changes in pick(Sahil) -- Fix sort order for large records, probable problem with OVC(Sahil) -- Add README documentation as mentioned in the canvas(Divy) -- Sort is crashing for larger runs, might be because of memory starvation due to a leak(Divy) \ No newline at end of file From 4642283e70947f4d2c9e99a8577ad0ba64bcbb4c Mon Sep 17 00:00:00 2001 From: Divy Patel Date: Wed, 6 Dec 2023 15:12:54 -0600 Subject: [PATCH 3/5] feat: add ExternalSort project documentation adhereing to canvas course project instructions Signed-off-by: Divy Patel --- README.md | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/README.md b/README.md index 58e2596..6bcde83 100644 --- a/README.md +++ b/README.md @@ -9,20 +9,20 @@ External Sorting algorithm for Databases having constrained storage hierarchy # Techniques Implemented by our submission and the corresponding Source Files and Lines -- Tournament trees [5]: File Tree.cpp @ Line 196 -- Offset-value coding [5]: File DataRecord.cpp @ Line 122 -- Minimum count of row & column comparisons [5] -- Cache-size mini runs [5]: File SortRecords.cpp @ Line 26 -- Device-optimized page sizes [5]: File SortRecords.cpp @ Line 81 and Line 136 -- Spilling memory-to-SSD [5]: File SortRecords.cpp @ Line 65 -- Spilling from SSD to disk [5]: File SortRecords.cpp @ Line 69 and Line 125 -- Graceful degradation: File SortRecords.cpp @ Line 72, Line 74 and Line 151 - - Into merging [5] - - Beyond one merge step [5] -- Optimized merge patterns [5]: File SortRecords.cpp @ Line 150 and Line 151 -- Verifying: File Iterator.cpp @ Line 69 and Line 84 - - sets of rows & values [5]: File Iterator.cpp @ Line 84 - 
- sort order [5]: File Iterator.cpp @ Line 69 +- **Tournament trees**: `File Tree.cpp @ Line 196` +- **Offset-value coding**: `File DataRecord.cpp @ Line 122` +- **Minimum count of row & column comparisons** +- **Cache-size mini runs**: `File SortRecords.cpp @ Line 26` +- **Device-optimized page sizes**: `File SortRecords.cpp @ Line 81 and Line 136` +- **Spilling memory-to-SSD**: `File SortRecords.cpp @ Line 65` +- **Spilling from SSD to disk**: `File SortRecords.cpp @ Line 69 and Line 125` +- **Graceful degradation**: `File SortRecords.cpp @ Line 72, Line 74 and Line 151` + - **Into merging** + - **Beyond one merge step** +- **Optimized merge patterns**: `File SortRecords.cpp @ Line 150 and Line 151` +- **Verifying**: `File Iterator.cpp @ Line 69 and Line 84` + - **sets of rows & values**: `File Iterator.cpp @ Line 84` + - **sort order**: `File Iterator.cpp @ Line 69` - Replacement selection? - Run size > memory size? From 01aa5f83d00824956e557d0454a1f66c34bf8afc Mon Sep 17 00:00:00 2001 From: Divy Patel Date: Wed, 6 Dec 2023 16:26:40 -0600 Subject: [PATCH 4/5] feat: add ExternalSort project documentation adhereing to canvas course project instructions Signed-off-by: Divy Patel --- README.md | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 6bcde83..2b99daf 100644 --- a/README.md +++ b/README.md @@ -7,6 +7,12 @@ External Sorting algorithm for Databases having constrained storage hierarchy - Devaki Kulkarni (9086222321) dgkulkarni2@wisc.edu - Manaswini Gogineni (9085432699) mgogineni@wisc.edu +# Individual Contributions +__Divy__: Cache-size mini runs, Device-optimized page sizes, Spilling memory-to-SSD, Spilling from SSD to disk, Graceful degradation, Optimized merge patterns, Testing and Memory Leak Check +__Sahil__: Tournament trees, Offset-value coding, Minimum count of row & column comparisons, Optimized merge patterns, Large-size records, Testing and Memory Leak Check +__Devaki__: Tournament trees, 
Offset-value coding, Large-size records +__Manaswini__: Verification + # Techniques Implemented by our submission and the corresponding Source Files and Lines - **Tournament trees**: `File Tree.cpp @ Line 196` @@ -33,7 +39,7 @@ External Sorting algorithm for Databases having constrained storage hierarchy # Reasons we chose to implement the specific subset of techniques -- `Tournament-tree priority queue` was used in order to achieve `high fan-in` for merging our sorted run inputs of records +- `Tournament-tree priority queue` was used in order to achieve `high fan-in` for merging our sorted run inputs of records and less number of comparisons than a standard tree-of-winners - `Offset-value coding` was used to achieve `minimum column value comparisons` - `Cache-size mini runs` were used to be able to fit the sort inputs, for tournament-tree, in the cache. This enabled us to leverage the low-latency accesses when there are `cache hits` - `Device-optimized page sizes` were used in order to being cognizant about the `access-profile(latency, bandwidth)` of various devices in the storage hierarchy. 
For `SSD`, we used `8KB(100 MB/s * 0.1 ms ~ 10KB)` and for `HDD`, we used `1MB(100 MB/s * 10 ms ~ 1MB)` From 206bb0ad3a517998cbe67ac69f87faba8257d963 Mon Sep 17 00:00:00 2001 From: Divy Patel Date: Wed, 6 Dec 2023 16:27:34 -0600 Subject: [PATCH 5/5] feat: add ExternalSort project documentation adhereing to canvas course project instructions Signed-off-by: Divy Patel --- README.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 2b99daf..b7be76a 100644 --- a/README.md +++ b/README.md @@ -8,10 +8,10 @@ External Sorting algorithm for Databases having constrained storage hierarchy - Manaswini Gogineni (9085432699) mgogineni@wisc.edu # Individual Contributions -__Divy__: Cache-size mini runs, Device-optimized page sizes, Spilling memory-to-SSD, Spilling from SSD to disk, Graceful degradation, Optimized merge patterns, Testing and Memory Leak Check -__Sahil__: Tournament trees, Offset-value coding, Minimum count of row & column comparisons, Optimized merge patterns, Large-size records, Testing and Memory Leak Check -__Devaki__: Tournament trees, Offset-value coding, Large-size records -__Manaswini__: Verification +- __Divy__: Cache-size mini runs, Device-optimized page sizes, Spilling memory-to-SSD, Spilling from SSD to disk, Graceful degradation, Optimized merge patterns, Testing and Memory Leak Check +- __Sahil__: Tournament trees, Offset-value coding, Minimum count of row & column comparisons, Optimized merge patterns, Large-size records, Testing and Memory Leak Check +- __Devaki__: Tournament trees, Offset-value coding, Large-size records +- __Manaswini__: Verification # Techniques Implemented by our submission and the corresponding Source Files and Lines