diff options
author | Igor Pashev <pashev.igor@gmail.com> | 2014-09-30 18:22:54 +0400 |
---|---|---|
committer | Igor Pashev <pashev.igor@gmail.com> | 2014-09-30 18:22:54 +0400 |
commit | 08bc9e01c274a01d107b348f921e1c74dd04bd3a (patch) | |
tree | 25348bff03c29d9dd6c6dd96bf82c7c9f9265ccf /src/cut.c | |
parent | b9c7373f203ab77c58cb6b131f8b58236ea337a2 (diff) | |
parent | c18578632fd3c9e513e613a86ba2b7c4ebee6c45 (diff) | |
download | coreutils-08bc9e01c274a01d107b348f921e1c74dd04bd3a.tar.gz |
Merge tag 'upstream/8.23'
Upstream version 8.23
Diffstat (limited to 'src/cut.c')
-rw-r--r-- | src/cut.c | 436 |
1 files changed, 182 insertions, 254 deletions
@@ -1,5 +1,5 @@ /* cut - remove parts of lines of files - Copyright (C) 1997-2013 Free Software Foundation, Inc. + Copyright (C) 1997-2014 Free Software Foundation, Inc. Copyright (C) 1984 David M. Ihnat This program is free software: you can redistribute it and/or modify @@ -53,24 +53,6 @@ } \ while (0) -/* Append LOW, HIGH to the list RP of range pairs, allocating additional - space if necessary. Update local variable N_RP. When allocating, - update global variable N_RP_ALLOCATED. */ - -#define ADD_RANGE_PAIR(rp, low, high) \ - do \ - { \ - if (low == 0 || high == 0) \ - FATAL_ERROR (_("fields and positions are numbered from 1")); \ - if (n_rp >= n_rp_allocated) \ - { \ - (rp) = X2NREALLOC (rp, &n_rp_allocated); \ - } \ - rp[n_rp].lo = (low); \ - rp[n_rp].hi = (high); \ - ++n_rp; \ - } \ - while (0) struct range_pair { @@ -78,6 +60,36 @@ struct range_pair size_t hi; }; +/* Array of `struct range_pair' holding all the finite ranges. */ +static struct range_pair *rp; + +/* Pointer inside RP. When checking if a byte or field is selected + by a finite range, we check if it is between CURRENT_RP.LO + and CURRENT_RP.HI. If the byte or field index is greater than + CURRENT_RP.HI then we make CURRENT_RP to point to the next range pair. */ +static struct range_pair *current_rp; + +/* Number of finite ranges specified by the user. */ +static size_t n_rp; + +/* Number of `struct range_pair's allocated. */ +static size_t n_rp_allocated; + + +/* Append LOW, HIGH to the list RP of range pairs, allocating additional + space if necessary. Update global variable N_RP. When allocating, + update global variable N_RP_ALLOCATED. */ + +static void +add_range_pair (size_t lo, size_t hi) +{ + if (n_rp == n_rp_allocated) + rp = X2NREALLOC (rp, &n_rp_allocated); + rp[n_rp].lo = lo; + rp[n_rp].hi = hi; + ++n_rp; +} + /* This buffer is used to support the semantics of the -s option (or lack of same) when the specified field list includes (does not include) the first field. In both of those cases, the entire @@ -90,26 +102,6 @@ static char *field_1_buffer; /* The number of bytes allocated for FIELD_1_BUFFER. */ static size_t field_1_bufsize; -/* The largest field or byte index used as an endpoint of a closed - or degenerate range specification; this doesn't include the starting - index of right-open-ended ranges. For example, with either range spec - '2-5,9-', '2-3,5,9-' this variable would be set to 5. */ -static size_t max_range_endpoint; - -/* If nonzero, this is the index of the first field in a range that goes - to end of line. */ -static size_t eol_range_start; - -/* This is a bit vector. - In byte mode, which bytes to output. - In field mode, which DELIM-separated fields to output. - Both bytes and fields are numbered starting with 1, - so the zeroth bit of this array is unused. - A field or byte K has been selected if - (K <= MAX_RANGE_ENDPOINT and is_printable_field(K)) - || (EOL_RANGE_START > 0 && K >= EOL_RANGE_START). */ -static unsigned char *printable_field; - enum operating_mode { undefined_mode, @@ -117,22 +109,22 @@ enum operating_mode /* Output characters that are in the given bytes. */ byte_mode, - /* Output the given delimeter-separated fields. */ + /* Output the given delimiter-separated fields. */ field_mode }; static enum operating_mode operating_mode; -/* If true do not output lines containing no delimeter characters. +/* If true do not output lines containing no delimiter characters. Otherwise, all such lines are printed. This option is valid only with field mode. */ static bool suppress_non_delimited; -/* If nonzero, print all bytes, characters, or fields _except_ +/* If true, print all bytes, characters, or fields _except_ those that were specified. */ static bool complement; -/* The delimeter character for field mode. */ +/* The delimiter character for field mode. */ static unsigned char delim; /* True if the --output-delimiter=STRING option was specified. */ @@ -148,15 +140,6 @@ static char *output_delimiter_string; /* True if we have ever read standard input. */ static bool have_read_stdin; -#define HT_RANGE_START_INDEX_INITIAL_CAPACITY 31 - -/* The set of range-start indices. For example, given a range-spec list like - '-b1,3-5,4-9,15-', the following indices will be recorded here: 1, 3, 15. - Note that although '4' looks like a range-start index, it is in the middle - of the '3-5' range, so it doesn't count. - This table is created/used IFF output_delimiter_specified is set. */ -static Hash_table *range_start_ht; - /* For long options that have no equivalent short option, use a non-character as a pseudo short option, starting with CHAR_MAX + 1. */ enum @@ -239,103 +222,57 @@ With no FILE, or when FILE is -, read standard input.\n\ exit (status); } -static inline void -mark_range_start (size_t i) -{ - /* Record the fact that 'i' is a range-start index. */ - void *ent_from_table = hash_insert (range_start_ht, (void*) i); - if (ent_from_table == NULL) - { - /* Insertion failed due to lack of memory. */ - xalloc_die (); - } - assert ((size_t) ent_from_table == i); -} - -static inline void -mark_printable_field (size_t i) -{ - size_t n = i / CHAR_BIT; - printable_field[n] |= (1 << (i % CHAR_BIT)); -} - -static inline bool -is_printable_field (size_t i) +/* Comparison function for qsort to order the list of + struct range_pairs. */ +static int +compare_ranges (const void *a, const void *b) { - size_t n = i / CHAR_BIT; - return (printable_field[n] >> (i % CHAR_BIT)) & 1; + int a_start = ((const struct range_pair *) a)->lo; + int b_start = ((const struct range_pair *) b)->lo; + return a_start < b_start ? -1 : a_start > b_start; } -static size_t -hash_int (const void *x, size_t tablesize) -{ -#ifdef UINTPTR_MAX - uintptr_t y = (uintptr_t) x; -#else - size_t y = (size_t) x; -#endif - return y % tablesize; -} +/* Reallocate Range Pair entries, with corresponding + entries outside the range of each specified entry. */ -static bool -hash_compare_ints (void const *x, void const *y) +static void +complement_rp (void) { - return (x == y) ? true : false; -} + if (complement) + { + struct range_pair *c = rp; + size_t n = n_rp; + size_t i; -static bool -is_range_start_index (size_t i) -{ - return hash_lookup (range_start_ht, (void *) i) ? true : false; -} + rp = NULL; + n_rp = 0; + n_rp_allocated = 0; -/* Return nonzero if the K'th field or byte is printable. - When returning nonzero, if RANGE_START is non-NULL, - set *RANGE_START to true if K is the beginning of a range, and to - false otherwise. */ + if (c[0].lo > 1) + add_range_pair (1, c[0].lo - 1); -static bool -print_kth (size_t k, bool *range_start) -{ - bool k_selected - = ((0 < eol_range_start && eol_range_start <= k) - || (k <= max_range_endpoint && is_printable_field (k))); + for (i = 1; i < n; ++i) + { + if (c[i-1].hi + 1 == c[i].lo) + continue; - bool is_selected = k_selected ^ complement; - if (range_start && is_selected) - *range_start = is_range_start_index (k); + add_range_pair (c[i-1].hi + 1, c[i].lo - 1); + } - return is_selected; -} + if (c[n-1].hi < SIZE_MAX) + add_range_pair (c[n-1].hi + 1, SIZE_MAX); -/* Comparison function for qsort to order the list of - struct range_pairs. */ -static int -compare_ranges (const void *a, const void *b) -{ - int a_start = ((const struct range_pair *) a)->lo; - int b_start = ((const struct range_pair *) b)->lo; - return a_start < b_start ? -1 : a_start > b_start; + free (c); + } } -/* Given the list of field or byte range specifications FIELDSTR, set - MAX_RANGE_ENDPOINT and allocate and initialize the PRINTABLE_FIELD - array. If there is a right-open-ended range, set EOL_RANGE_START - to its starting index. FIELDSTR should be composed of one or more - numbers or ranges of numbers, separated by blanks or commas. - Incomplete ranges may be given: '-m' means '1-m'; 'n-' means 'n' - through end of line. Return true if FIELDSTR contains at least - one field specification, false otherwise. */ - -/* FIXME-someday: What if the user wants to cut out the 1,000,000-th - field of some huge input file? This function shouldn't have to - allocate a table of a million bits just so we can test every - field < 10^6 with an array dereference. Instead, consider using - an adaptive approach: if the range of selected fields is too large, - but only a few fields/byte-offsets are actually selected, use a - hash table. If the range of selected fields is too large, and - too many are selected, then resort to using the range-pairs (the - 'rp' array) directly. */ +/* Given the list of field or byte range specifications FIELDSTR, + allocate and initialize the RP array. FIELDSTR should + be composed of one or more numbers or ranges of numbers, separated + by blanks or commas. Incomplete ranges may be given: '-m' means '1-m'; + 'n-' means 'n' through end of line. + Return true if FIELDSTR contains at least one field specification, + false otherwise. */ static bool set_fields (const char *fieldstr) @@ -348,14 +285,10 @@ set_fields (const char *fieldstr) bool field_found = false; /* True if at least one field spec has been processed. */ - struct range_pair *rp = NULL; - size_t n_rp = 0; - size_t n_rp_allocated = 0; size_t i; bool in_digits = false; - /* Collect and store in RP the range end points. - It also sets EOL_RANGE_START if appropriate. */ + /* Collect and store in RP the range end points. */ while (true) { @@ -390,10 +323,8 @@ set_fields (const char *fieldstr) In any case, 'initial' contains the start of the range. */ if (!rhs_specified) { - /* 'n-'. From 'initial' to end of line. If we've already - seen an M- range, ignore subsequent N- unless N < M. */ - if (eol_range_start == 0 || initial < eol_range_start) - eol_range_start = initial; + /* 'n-'. From 'initial' to end of line. */ + add_range_pair (initial, SIZE_MAX); field_found = true; } else @@ -402,54 +333,23 @@ set_fields (const char *fieldstr) if (value < initial) FATAL_ERROR (_("invalid decreasing range")); - /* Is there already a range going to end of line? */ - if (eol_range_start != 0) - { - /* Yes. Is the new sequence already contained - in the old one? If so, no processing is - necessary. */ - if (initial < eol_range_start) - { - /* No, the new sequence starts before the - old. Does the old range going to end of line - extend into the new range? */ - if (eol_range_start <= value) - { - /* Yes. Simply move the end of line marker. */ - eol_range_start = initial; - } - else - { - /* No. A simple range, before and disjoint from - the range going to end of line. Fill it. */ - ADD_RANGE_PAIR (rp, initial, value); - } - - /* In any case, some fields were selected. */ - field_found = true; - } - } - else - { - /* There is no range going to end of line. */ - ADD_RANGE_PAIR (rp, initial, value); - field_found = true; - } - value = 0; + add_range_pair (initial, value); + field_found = true; } + value = 0; } else { /* A simple field number, not a range. */ - ADD_RANGE_PAIR (rp, value, value); + if (value == 0) + FATAL_ERROR (_("fields and positions are numbered from 1")); + add_range_pair (value, value); value = 0; field_found = true; } if (*fieldstr == '\0') - { - break; - } + break; fieldstr++; lhs_specified = false; @@ -470,7 +370,8 @@ set_fields (const char *fieldstr) lhs_specified = 1; /* Detect overflow. */ - if (!DECIMAL_DIGIT_ACCUMULATE (value, *fieldstr - '0', size_t)) + if (!DECIMAL_DIGIT_ACCUMULATE (value, *fieldstr - '0', size_t) + || value == SIZE_MAX) { /* In case the user specified -c$(echo 2^64|bc),22, complain only about the first number. */ @@ -493,51 +394,62 @@ set_fields (const char *fieldstr) FATAL_ERROR (_("invalid byte, character or field list")); } - max_range_endpoint = 0; - for (i = 0; i < n_rp; i++) + qsort (rp, n_rp, sizeof (rp[0]), compare_ranges); + + /* Merge range pairs (e.g. `2-5,3-4' becomes `2-5'). */ + for (i = 0; i < n_rp; ++i) { - if (rp[i].hi > max_range_endpoint) - max_range_endpoint = rp[i].hi; + for (size_t j = i + 1; j < n_rp; ++j) + { + if (rp[j].lo <= rp[i].hi) + { + rp[i].hi = MAX (rp[j].hi, rp[i].hi); + memmove (rp + j, rp + j + 1, (n_rp - j - 1) * sizeof *rp); + n_rp--; + j--; + } + else + break; + } } - /* Allocate an array large enough so that it may be indexed by - the field numbers corresponding to all finite ranges - (i.e. '2-6' or '-4', but not '5-') in FIELDSTR. */ + complement_rp (); - if (max_range_endpoint) - printable_field = xzalloc (max_range_endpoint / CHAR_BIT + 1); + /* After merging, reallocate RP so we release memory to the system. + Also add a sentinel at the end of RP, to avoid out of bounds access + and for performance reasons. */ + ++n_rp; + rp = xrealloc (rp, n_rp * sizeof (struct range_pair)); + rp[n_rp - 1].lo = rp[n_rp - 1].hi = SIZE_MAX; - qsort (rp, n_rp, sizeof (rp[0]), compare_ranges); + return field_found; +} - /* Set the array entries corresponding to integers in the ranges of RP. */ - for (i = 0; i < n_rp; i++) - { - /* Ignore any range that is subsumed by the to-EOL range. */ - if (eol_range_start && eol_range_start <= rp[i].lo) - continue; - - /* Record the range-start indices, i.e., record each start - index that is not part of any other (lo..hi] range. */ - size_t rsi_candidate = complement ? rp[i].hi + 1 : rp[i].lo; - if (output_delimiter_specified - && !is_printable_field (rsi_candidate)) - mark_range_start (rsi_candidate); - - for (size_t j = rp[i].lo; j <= rp[i].hi; j++) - mark_printable_field (j); - } +/* Increment *ITEM_IDX (i.e. a field or byte index), + and if required CURRENT_RP. */ - if (output_delimiter_specified - && !complement - && eol_range_start - && max_range_endpoint - && (max_range_endpoint < eol_range_start - || !is_printable_field (eol_range_start))) - mark_range_start (eol_range_start); +static inline void +next_item (size_t *item_idx) +{ + (*item_idx)++; + if ((*item_idx) > current_rp->hi) + current_rp++; +} - free (rp); +/* Return nonzero if the K'th field or byte is printable. */ - return field_found; +static inline bool +print_kth (size_t k) +{ + return current_rp->lo <= k; +} + +/* Return nonzero if K'th byte is the beginning of a range. */ + +static inline bool +is_range_start_index (size_t k) +{ + return k == current_rp->lo; } /* Read from stream STREAM, printing to standard output any selected bytes. */ @@ -552,7 +464,8 @@ cut_bytes (FILE *stream) byte_idx = 0; print_delimiter = false; - while (1) + current_rp = rp; + while (true) { int c; /* Each character from the file. */ @@ -563,6 +476,7 @@ cut_bytes (FILE *stream) putchar ('\n'); byte_idx = 0; print_delimiter = false; + current_rp = rp; } else if (c == EOF) { @@ -572,16 +486,19 @@ cut_bytes (FILE *stream) } else { - bool range_start; - bool *rs = output_delimiter_specified ? &range_start : NULL; - if (print_kth (++byte_idx, rs)) + next_item (&byte_idx); + if (print_kth (byte_idx)) { - if (rs && *rs && print_delimiter) + if (output_delimiter_specified) { - fwrite (output_delimiter_string, sizeof (char), - output_delimiter_length, stdout); + if (print_delimiter && is_range_start_index (byte_idx)) + { + fwrite (output_delimiter_string, sizeof (char), + output_delimiter_length, stdout); + } + print_delimiter = true; } - print_delimiter = true; + putchar (c); } } @@ -598,6 +515,8 @@ cut_fields (FILE *stream) bool found_any_selected_field = false; bool buffer_first_field; + current_rp = rp; + c = getc (stream); if (c == EOF) return; @@ -611,7 +530,7 @@ cut_fields (FILE *stream) and the first field has been selected, or if non-delimited lines must be suppressed and the first field has *not* been selected. That is because a non-delimited line has exactly one field. */ - buffer_first_field = (suppress_non_delimited ^ !print_kth (1, NULL)); + buffer_first_field = (suppress_non_delimited ^ !print_kth (1)); while (1) { @@ -619,7 +538,6 @@ cut_fields (FILE *stream) { ssize_t len; size_t n_bytes; - bool got_line; len = getndelim2 (&field_1_buffer, &field_1_bufsize, 0, GETNLINE_NO_LIMIT, delim, '\n', stream); @@ -636,14 +554,13 @@ cut_fields (FILE *stream) assert (n_bytes != 0); c = 0; - got_line = field_1_buffer[n_bytes - 1] == '\n'; /* If the first field extends to the end of line (it is not delimited) and we are printing all non-delimited lines, print this one. */ - if (to_uchar (field_1_buffer[n_bytes - 1]) != delim || got_line) + if (to_uchar (field_1_buffer[n_bytes - 1]) != delim) { - if (suppress_non_delimited && !(got_line && delim == '\n')) + if (suppress_non_delimited) { /* Empty. */ } @@ -651,24 +568,36 @@ cut_fields (FILE *stream) { fwrite (field_1_buffer, sizeof (char), n_bytes, stdout); /* Make sure the output line is newline terminated. */ - if (! got_line) + if (field_1_buffer[n_bytes - 1] != '\n') putchar ('\n'); c = '\n'; } continue; } - if (print_kth (1, NULL)) + if (print_kth (1)) { /* Print the field, but not the trailing delimiter. */ fwrite (field_1_buffer, sizeof (char), n_bytes - 1, stdout); - found_any_selected_field = true; + + /* With -d$'\n' don't treat the last '\n' as a delimiter. */ + if (delim == '\n') + { + int last_c = getc (stream); + if (last_c != EOF) + { + ungetc (last_c, stream); + found_any_selected_field = true; + } + } + else + found_any_selected_field = true; } - ++field_idx; + next_item (&field_idx); } int prev_c = c; - if (print_kth (field_idx, NULL)) + if (print_kth (field_idx)) { if (found_any_selected_field) { @@ -691,21 +620,32 @@ cut_fields (FILE *stream) } } - if (c == '\n' || c == EOF) + /* With -d$'\n' don't treat the last '\n' as a delimiter. */ + if (delim == '\n' && c == delim) + { + int last_c = getc (stream); + if (last_c != EOF) + ungetc (last_c, stream); + else + c = last_c; + } + + if (c == delim) + next_item (&field_idx); + else if (c == '\n' || c == EOF) { if (found_any_selected_field || !(suppress_non_delimited && field_idx == 1)) { - if (c == '\n' || prev_c != '\n') + if (c == '\n' || prev_c != '\n' || delim == '\n') putchar ('\n'); } if (c == EOF) break; field_idx = 1; + current_rp = rp; found_any_selected_field = false; } - else if (c == delim) - field_idx++; } } @@ -854,16 +794,6 @@ main (int argc, char **argv) FATAL_ERROR (_("suppressing non-delimited lines makes sense\n\ \tonly when operating on fields")); - if (output_delimiter_specified) - { - range_start_ht = hash_initialize (HT_RANGE_START_INDEX_INITIAL_CAPACITY, - NULL, hash_int, - hash_compare_ints, NULL); - if (range_start_ht == NULL) - xalloc_die (); - - } - if (! set_fields (spec_list_string)) { if (operating_mode == field_mode) @@ -890,8 +820,6 @@ main (int argc, char **argv) for (ok = true; optind < argc; optind++) ok &= cut_file (argv[optind]); - if (range_start_ht) - hash_free (range_start_ht); if (have_read_stdin && fclose (stdin) == EOF) { |