/* sort - sort a file of lines Author: Michiel Huisjes */ /* SYNOPSIS: * sort [-funbirdcmt'x'] [+beg_pos[opts] [-end_pos]] [-o outfile] [file].. * * [opts] can be any of * -f : Fold upper case to lower. * -n : Sort to numeric value (optional decimal point) implies -b * -b : Skip leading blanks * -i : Ignore chars outside ASCII range (040 - 0176) * -r : Reverse the sense of comparisons. * -d : Sort to dictionary order. Only letters, digits, comma's and points * are compared. * If any of these flags are used in [opts], then they override all global * ordering for this field. * * I/O control flags are: * -u : Print uniq lines only once. * -c : Check if files are sorted in order. * -m : Merge already sorted files. * -o outfile : Name of output file. (Can be one of the input files). * Default is stdout. * - : Take stdin as input. * * Fields: * -t'x' : Field separating character is 'x' * +a.b : Start comparing at field 'a' with offset 'b'. A missing 'b' is * taken to be 0. * -a.b : Stop comparing at field 'a' with offset 'b'. A missing 'b' is * taken to be 0. * A missing -a.b means the rest of the line. */ #include #include #include #include #include #include #include #include #include #define OPEN_FILES (OPEN_MAX-4) /* Nr of open files per process */ #if __minix_vmd #define MEMORY_SIZE (1024 * 1024) #else #define MEMORY_SIZE ((10 * sizeof(int)) * 1024) #endif /* Total mem_size */ #define LINE_SIZE (1024 >> 1) /* Max length of a line */ #define IO_SIZE (2 * 1024) /* Size of buffered output */ #define STD_OUT 1 /* Fd of terminal */ /* Return status of functions */ #define OK 0 #define ERROR -1 #define NIL_PTR ((char *) 0) /* Compare return values */ #define LOWER -1 #define SAME 0 #define HIGHER 1 /* Table definitions. */ #define DICT 0x001 /* Alpha, numeric, letters and . */ #define ASCII 0x002 /* All between ' ' and '~' */ #define BLANK 0x004 /* ' ' and '\t' */ #define DIGIT 0x008 /* 0-9 */ #define UPPER 0x010 /* A-Z */ typedef int BOOL; #define FALSE 0 #define TRUE 1 typedef struct { int fd; /* Fd of file */ char *buffer; /* Buffer for reads */ int read_chars; /* Nr of chars actually read in buffer */ int cnt; /* Nr of chars taken out of buffer */ char *line; /* Contains line currently used */ } MERGE; #define NIL_MERGE ((MERGE *) 0) MERGE merge_f[OPEN_FILES]; /* Merge structs */ int buf_size; /* Size of core available for each struct */ #define FIELDS_LIMIT 10 /* 1 global + 9 user */ #define GLOBAL 0 typedef struct { int beg_field, beg_pos; /* Begin field + offset */ int end_field, end_pos; /* End field + offset. ERROR == EOLN */ BOOL reverse; /* TRUE if rev. flag set on this field */ BOOL blanks; BOOL dictionary; BOOL fold_case; BOOL ascii; BOOL numeric; } FIELD; /* Field declarations. A total of FILEDS_LIMIT is allowed */ FIELD fields[FIELDS_LIMIT]; int field_cnt; /* Nr of field actually assigned */ /* Various output control flags */ BOOL check = FALSE; BOOL only_merge = FALSE; BOOL uniq = FALSE; char *mem_top; /* Mem_top points to lowest pos of memory. */ char *cur_pos; /* First free position in mem */ char **line_table; /* Pointer to the internal line table */ BOOL in_core = TRUE; /* Set if input cannot all be sorted in core */ /* Place where temp_files should be made */ char temp_files[] = "/tmp/sort.XXXXX.XX"; char *output_file; /* Name of output file */ int out_fd; /* Fd to output file (could be STD_OUT) */ char out_buffer[IO_SIZE]; /* For buffered output */ char **argptr; /* Pointer to argv structure */ int args_offset; /* Nr of args spilled on options */ int args_limit; /* Nr of args given */ char separator; /* Char that separates fields */ int nr_of_files = 0; /* Nr_of_files to be merged */ int disabled; /* Nr of files done */ char USAGE[] = "Usage: sort [-funbirdcmt'x'] [+beg_pos [-end_pos]] [-o outfile] [file] .."; /* Forward declarations */ _PROTOTYPE(int main, (int argc, char **argv)); _PROTOTYPE(void get_opts, (char *ptr, FIELD * field)); _PROTOTYPE(void new_field, (FIELD * field, int *offset, BOOL beg_fl)); _PROTOTYPE(void adjust_options, (FIELD * field)); _PROTOTYPE(void error, (BOOL quit, char *message, char *arg)); _PROTOTYPE(void open_outfile, (void)); _PROTOTYPE(void get_file, (int fd, off_t size)); _PROTOTYPE(int last_line, (void)); _PROTOTYPE(void print_table, (int fd)); _PROTOTYPE(char *file_name, (int nr)); _PROTOTYPE(void mread, (int fd, char *address, int bytes)); _PROTOTYPE(void mwrite, (int fd, char *address, int bytes)); _PROTOTYPE(void sort, (void)); _PROTOTYPE(void sort_table, (int nel)); _PROTOTYPE(void incr, (int si, int ei)); _PROTOTYPE(int cmp_fields, (char *el1, char *el2)); _PROTOTYPE(void build_field, (char *dest, FIELD * field, char *src)); _PROTOTYPE(char *skip_fields, (char *str, int nf)); _PROTOTYPE(int compare, (char *el1, char *el2)); _PROTOTYPE(int cmp, (unsigned char *el1, unsigned char *el2, FIELD * field)); _PROTOTYPE(int digits, (char *str1, char *str2, BOOL check_sign)); _PROTOTYPE(void files_merge, (int file_cnt)); _PROTOTYPE(void merge, (int start_file, int limit_file)); _PROTOTYPE(void put_line, (char *line)); _PROTOTYPE(MERGE * print, (MERGE * merg, int file_cnt)); _PROTOTYPE(int read_line, (MERGE * merg)); _PROTOTYPE(MERGE * skip_lines, (MERGE * smallest, int file_cnt)); _PROTOTYPE(void uniq_lines, (MERGE * merg)); _PROTOTYPE(void check_file, (int fd, char *file)); _PROTOTYPE(int length, (char *line)); _PROTOTYPE(void copy, (char *dest, char *src)); _PROTOTYPE(char *msbrk, (int size)); _PROTOTYPE(void mbrk, (char *address)); _PROTOTYPE(void catch, (int dummy)); /* Table of all chars. 0 means no special meaning. */ char table[256] = { /* '^@' to space */ 0, 0, 0, 0, 0, 0, 0, 0, 0, BLANK | DICT, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* Space to '0' */ BLANK | DICT | ASCII, ASCII, ASCII, ASCII, ASCII, ASCII, ASCII, ASCII, ASCII, ASCII, ASCII, ASCII, ASCII, ASCII, ASCII, ASCII, /* '0' until '9' */ DIGIT | DICT | ASCII, DIGIT | DICT | ASCII, DIGIT | DICT | ASCII, DIGIT | DICT | ASCII, DIGIT | DICT | ASCII, DIGIT | DICT | ASCII, DIGIT | DICT | ASCII, DIGIT | DICT | ASCII, DIGIT | DICT | ASCII, DIGIT | DICT | ASCII, /* ASCII from ':' to '@' */ ASCII, ASCII, ASCII, ASCII, ASCII, ASCII, ASCII, /* Upper case letters 'A' to 'Z' */ UPPER | DICT | ASCII, UPPER | DICT | ASCII, UPPER | DICT | ASCII, UPPER | DICT | ASCII, UPPER | DICT | ASCII, UPPER | DICT | ASCII, UPPER | DICT | ASCII, UPPER | DICT | ASCII, UPPER | DICT | ASCII, UPPER | DICT | ASCII, UPPER | DICT | ASCII, UPPER | DICT | ASCII, UPPER | DICT | ASCII, UPPER | DICT | ASCII, UPPER | DICT | ASCII, UPPER | DICT | ASCII, UPPER | DICT | ASCII, UPPER | DICT | ASCII, UPPER | DICT | ASCII, UPPER | DICT | ASCII, UPPER | DICT | ASCII, UPPER | DICT | ASCII, UPPER | DICT | ASCII, UPPER | DICT | ASCII, UPPER | DICT | ASCII, UPPER | DICT | ASCII, /* ASCII from '[' to '`' */ ASCII, ASCII, ASCII, ASCII, ASCII, ASCII, /* Lower case letters from 'a' to 'z' */ DICT | ASCII, DICT | ASCII, DICT | ASCII, DICT | ASCII, DICT | ASCII, DICT | ASCII, DICT | ASCII, DICT | ASCII, DICT | ASCII, DICT | ASCII, DICT | ASCII, DICT | ASCII, DICT | ASCII, DICT | ASCII, DICT | ASCII, DICT | ASCII, DICT | ASCII, DICT | ASCII, DICT | ASCII, DICT | ASCII, DICT | ASCII, DICT | ASCII, DICT | ASCII, DICT | ASCII, DICT | ASCII, DICT | ASCII, /* ASCII from '{' to '~' */ ASCII, ASCII, ASCII, ASCII, /* Stuff from -1 to -177 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; /* * Get_opts () assigns the options into the field structure as described in ptr. * This field structure could be the GLOBAL one. */ void get_opts(ptr, field) register char *ptr; register FIELD *field; { switch (*ptr) { case 'b': /* Skip leading blanks */ field->blanks = TRUE; break; case 'd': /* Dictionary order */ field->dictionary = TRUE; break; case 'f': /* Fold upper case to lower */ field->fold_case = TRUE; break; case 'i': /* Skip chars outside ' ' '~' */ field->ascii = TRUE; break; case 'n': /* Sort on numeric */ field->numeric = TRUE; field->blanks = TRUE; break; case 'r': /* Reverse comparisons */ field->reverse = TRUE; break; default: /* Illegal options */ error(TRUE, USAGE, NIL_PTR); } } /* New_field () assigns a new field as described by the arguments. * A field description is of the form: +a.b[opts] -c.d, where b and d, as well * as -c.d and [opts] are optional. Nr before digit is field nr. Nr after digit * is offset from field. */ void new_field(field, offset, beg_fl) register FIELD *field; /* Field to assign */ int *offset; /* Offset in argv structure */ BOOL beg_fl; /* Assign beg or end of field */ { register char *ptr; ptr = argptr[*offset]; *offset += 1; /* Incr offset to next arg */ ptr++; if (beg_fl) field->beg_field = atoi(ptr); /* Assign int of first field */ else field->end_field = atoi(ptr); while (table[*ptr] & DIGIT) /* Skip all digits */ ptr++; if (*ptr == '.') { /* Check for offset */ ptr++; if (beg_fl) field->beg_pos = atoi(ptr); else field->end_pos = atoi(ptr); while (table[*ptr] & DIGIT) /* Skip digits */ ptr++; } if (beg_fl) { while (*ptr != '\0') /* Check options after field */ get_opts(ptr++, field); } if (beg_fl) { /* Check for end pos */ ptr = argptr[*offset]; if (ptr && *ptr == '-' && ((table[*(ptr + 1)] & DIGIT) || *(ptr + 1) == '.')) { new_field(field, offset, FALSE); if (field->beg_field > field->end_field) error(TRUE, "End field is before start field!", NIL_PTR); } else /* No end pos. */ field->end_field = ERROR; } } int main(argc, argv) int argc; char *argv[]; { int arg_count = 1; /* Offset in argv */ struct stat st; register char *ptr; /* Ptr to *argv in use */ register int fd; int pid, pow; argptr = argv; cur_pos = mem_top = msbrk(MEMORY_SIZE); /* Find lowest mem. location */ while (arg_count < argc && ((ptr = argv[arg_count])[0] == '-' || *ptr == '+')) { if (*ptr == '-' && *(ptr + 1) == '\0') /* "-" means stdin */ break; if (*ptr == '+') { /* Assign field. */ if (++field_cnt == FIELDS_LIMIT) error(TRUE, "Too many fields", NIL_PTR); new_field(&fields[field_cnt], &arg_count, TRUE); } else { /* Get output options */ while (*++ptr) { switch (*ptr) { case 'c': /* Only check file */ check = TRUE; break; case 'm': /* Merge (sorted) files */ only_merge = TRUE; break; case 'u': /* Only give uniq lines */ uniq = TRUE; break; case 'o': /* Name of output file */ output_file = argv[++arg_count]; break; case 't': /* Field separator */ ptr++; separator = *ptr; break; default: /* Sort options */ get_opts(ptr, &fields[GLOBAL]); } } arg_count++; } } for (fd = 1; fd <= field_cnt; fd++) adjust_options(&fields[fd]); /* Create name of tem_files 'sort.pid.aa' */ ptr = &temp_files[10]; pid = getpid(); pow = 10000; while (pow != 0) { *ptr++ = pid / pow + '0'; pid %= pow; pow /= 10; } signal(SIGINT, catch); /* Only merge files. Set up */ if (only_merge) { args_limit = args_offset = arg_count; while (argv[args_limit] != NIL_PTR) args_limit++; /* Find nr of args */ files_merge(args_limit - arg_count); exit(0); } if (arg_count == argc) { /* No args left. Use stdin */ if (check) check_file(0, NIL_PTR); else get_file(0, (off_t) 0); } else while (arg_count < argc) { /* Sort or check args */ if (strcmp(argv[arg_count], "-") == 0) fd = 0; else if (stat(argv[arg_count], &st) < 0) { error(FALSE, "Cannot find ", argv[arg_count++]); continue; } /* Open files */ else if ((fd = open(argv[arg_count], O_RDONLY)) < 0) { error(FALSE, "Cannot open ", argv[arg_count++]); continue; } if (check) check_file(fd, argv[arg_count]); else /* Get_file reads whole file */ get_file(fd, st.st_size); arg_count++; } if (check) exit(0); sort(); /* Sort whatever is left */ if (nr_of_files == 1) /* Only one file sorted -> don't merge */ exit(0); files_merge(nr_of_files); return(0); } /* Adjust_options() assigns all global variables set also in the fields * assigned. */ void adjust_options(field) register FIELD *field; { register FIELD *gfield = &fields[GLOBAL]; if (gfield->reverse) field->reverse = TRUE; if (gfield->blanks) field->blanks = TRUE; if (gfield->dictionary) field->dictionary = TRUE; if (gfield->fold_case) field->fold_case = TRUE; if (gfield->ascii) field->ascii = TRUE; if (gfield->numeric) field->numeric = TRUE; } /* Error () prints the error message on stderr and exits if quit == TRUE. */ void error(quit, message, arg) register BOOL quit; register char *message, *arg; { write(2, message, strlen(message)); if (arg != NIL_PTR) write(2, arg, strlen(arg)); perror(" "); if (quit) exit(1); } /* Open_outfile () assigns to out_fd the fd where the output must go when all * the sorting is done. */ void open_outfile() { if (output_file == NIL_PTR) out_fd = STD_OUT; else if ((out_fd = creat(output_file, 0644)) < 0) error(TRUE, "Cannot creat ", output_file); } /* Get_file reads the whole file of filedescriptor fd. If the file is too big * to keep in core, a partial sort is done, and the output is stashed somewhere. */ void get_file(fd, size) int fd; /* Fd of file to read */ register off_t size; /* Size of file */ { register int i; int rest; /* Rest in memory */ char save_ch; /* Used in stdin readings */ rest = MEMORY_SIZE - (cur_pos - mem_top); if (fd == 0) { /* We're reding stdin */ while ((i = read(0, cur_pos, rest)) > 0) { if ((cur_pos - mem_top) + i == MEMORY_SIZE) { in_core = FALSE; i = last_line(); /* End of last line */ save_ch = mem_top[i]; mem_top[i] = '\0'; sort(); /* Sort core */ mem_top[i] = save_ch; /* Restore erased char */ /* Restore last (half read) line */ for (rest = 0; i + rest != MEMORY_SIZE; rest++) mem_top[rest] = mem_top[i + rest]; /* Assign current pos. in memory */ cur_pos = &mem_top[rest]; } else { /* Fits, just assign position in mem. */ cur_pos = cur_pos + i; *cur_pos = '\0'; } /* Calculate rest of mem */ rest = MEMORY_SIZE - (cur_pos - mem_top); } } /* Reading file. Check size */ else if (size > rest) { /* Won't fit */ mread(fd, cur_pos, rest); in_core = FALSE; i = last_line(); /* Get pos. of last line */ mem_top[i] = '\0'; /* Truncate */ (void) lseek(fd, (off_t) (i - MEMORY_SIZE), SEEK_CUR); /* Do this next time */ size = size - rest - i + MEMORY_SIZE; /* Calculate rest */ cur_pos = mem_top; /* Reset mem */ sort(); /* Sort core */ get_file(fd, size); /* Get rest of file */ } else { /* Fits. Just read in */ rest = size; mread(fd, cur_pos, rest); cur_pos = cur_pos + rest; /* Reassign cur_pos */ *cur_pos = '\0'; (void) close(fd); /* File completed */ } } /* Last_line () find the last line in core and retuns the offset from the top * of the memory. */ int last_line() { register int i; for (i = MEMORY_SIZE - 2; i > 0; i--) if (mem_top[i] == '\n') break; return i + 1; } /* Print_table prints the line table in the given file_descriptor. If the fd * equals ERROR, it opens a temp_file itself. */ void print_table(fd) int fd; { register char **line_ptr; /* Ptr in line_table */ register char *ptr; /* Ptr to line */ int index = 0; /* Index in output buffer */ if (fd == ERROR) { if ((fd = creat(file_name(nr_of_files), 0644)) < 0) error(TRUE, "Cannot creat ", file_name(nr_of_files)); } for (line_ptr = line_table; *line_ptr != NIL_PTR; line_ptr++) { ptr = *line_ptr; /* Skip all same lines if uniq is set */ if (uniq && *(line_ptr + 1) != NIL_PTR) { if (compare(ptr, *(line_ptr + 1)) == SAME) continue; } do { /* Print line in a buffered way */ out_buffer[index++] = *ptr; if (index == IO_SIZE) { mwrite(fd, out_buffer, IO_SIZE); index = 0; } } while (*ptr++ != '\n'); } mwrite(fd, out_buffer, index);/* Flush buffer */ (void) close(fd); /* Close file */ nr_of_files++; /* Increment nr_of_files to merge */ } /* File_name () returns the nr argument from the argument list, or a uniq * filename if the nr is too high, or the arguments were not merge files. */ char *file_name(nr) register int nr; { if (only_merge) { if (args_offset + nr < args_limit) return argptr[args_offset + nr]; } temp_files[16] = nr / 26 + 'a'; temp_files[17] = nr % 26 + 'a'; return temp_files; } /* Mread () performs a normal read (), but checks the return value. */ void mread(fd, address, bytes) int fd; char *address; register int bytes; { if (read(fd, address, bytes) < 0 && bytes != 0) error(TRUE, "Read error", NIL_PTR); } /* Mwrite () performs a normal write (), but checks the return value. */ void mwrite(fd, address, bytes) int fd; char *address; register int bytes; { if (write(fd, address, bytes) != bytes && bytes != 0) error(TRUE, "Write error", NIL_PTR); } /* Sort () sorts the input in memory starting at mem_top. */ void sort() { register char *ptr = mem_top; register int count = 0; /* Count number of lines in memory */ while (*ptr) { if (*ptr++ == '\n') count++; } /* Set up the line table */ line_table = (char **) msbrk(count * sizeof(char *) + sizeof(char *)); count = 1; ptr = line_table[0] = mem_top; while (*ptr) { if (*ptr++ == '\n') line_table[count++] = ptr; } line_table[count - 1] = NIL_PTR; /* Sort the line table */ sort_table(count - 1); /* Stash output somewhere */ if (in_core) { open_outfile(); print_table(out_fd); } else print_table(ERROR); /* Free line table */ mbrk((char *) line_table); } /* Sort_table () sorts the line table consisting of nel elements. */ void sort_table(nel) register int nel; { char *tmp; register int i; /* Make heap */ for (i = (nel >> 1); i >= 1; i--) incr(i, nel); /* Sort from heap */ for (i = nel; i > 1; i--) { tmp = line_table[0]; line_table[0] = line_table[i - 1]; line_table[i - 1] = tmp; incr(1, i - 1); } } /* Incr () increments the heap. */ void incr(si, ei) register int si, ei; { char *tmp; while (si <= (ei >> 1)) { si <<= 1; if (si + 1 <= ei && compare(line_table[si - 1], line_table[si]) <= 0) si++; if (compare(line_table[(si >> 1) - 1], line_table[si - 1]) >= 0) return; tmp = line_table[(si >> 1) - 1]; line_table[(si >> 1) - 1] = line_table[si - 1]; line_table[si - 1] = tmp; } } /* Cmp_fields builds new lines out of the lines pointed to by el1 and el2 and * puts it into the line1 and line2 arrays. It then calls the cmp () routine * with the field describing the arguments. */ int cmp_fields(el1, el2) register char *el1, *el2; { int i, ret; char line1[LINE_SIZE], line2[LINE_SIZE]; for (i = 0; i < field_cnt; i++) { /* Setup line parts */ build_field(line1, &fields[i + 1], el1); build_field(line2, &fields[i + 1], el2); if ((ret = cmp((unsigned char *) line1, (unsigned char *) line2, &fields[i + 1])) != SAME) break; /* If equal, try next field */ } /* Check for reverse flag */ if (i != field_cnt && fields[i + 1].reverse) return -ret; /* Else return the last return value of cmp () */ return ret; } /* Build_field builds a new line from the src as described by the field. * The result is put in dest. */ void build_field(dest, field, src) char *dest; /* Holds result */ register FIELD *field; /* Field description */ register char *src; /* Source line */ { char *begin = src; /* Remember start location */ char *last; /* Pointer to end location */ int i; /* Skip begin fields */ src = skip_fields(src, field->beg_field); /* Skip begin positions */ for (i = 0; i < field->beg_pos && *src != '\n'; i++) src++; /* Copy whatever is left */ copy(dest, src); /* If end field is assigned truncate (perhaps) the part copied */ if (field->end_field != ERROR) { /* Find last field */ last = skip_fields(begin, field->end_field); /* Skip positions as given by end fields description */ for (i = 0; i < field->end_pos && *last != '\n'; i++) last++; dest[last - src] = '\n';/* Truncate line */ } } /* Skip_fields () skips nf fields of the line pointed to by str. */ char *skip_fields(str, nf) register char *str; int nf; { while (nf-- > 0) { if (separator == '\0') {/* Means ' ' or '\t' */ while (*str != ' ' && *str != '\t' && *str != '\n') str++; while (table[*str] & BLANK) str++; } else { while (*str != separator && *str != '\n') str++; if (*str == separator) str++; } } return str; /* Return pointer to indicated field */ } /* Compare is called by all sorting routines. It checks if fields assignments * has been made. if so, it calls cmp_fields (). If not, it calls cmp () and * reversed the return value if the (global) reverse flag is set. */ int compare(el1, el2) register char *el1, *el2; { int ret; if (field_cnt > GLOBAL) return cmp_fields(el1, el2); ret = cmp((unsigned char *) el1, (unsigned char *) el2, &fields[GLOBAL]); return(fields[GLOBAL].reverse) ? -ret : ret; } /* Cmp () is the actual compare routine. It compares according to the * description given in the field pointer. */ int cmp(el1, el2, field) register unsigned char *el1, *el2; FIELD *field; { int c1, c2; if (field->blanks) { /* Skip leading blanks */ while (table[*el1] & BLANK) el1++; while (table[*el2] & BLANK) el2++; } if (field->numeric) /* Compare numeric */ return digits((char *) el1, (char *) el2, TRUE); for (;;) { while (*el1 == *el2) { if (*el1++ == '\n') /* EOLN on both strings */ return SAME; el2++; } if (*el1 == '\n') /* EOLN on string one */ return LOWER; if (*el2 == '\n') return HIGHER; if (field->ascii) { /* Skip chars outside 040 - 0177 */ if ((table[*el1] & ASCII) == 0) { do { el1++; } while ((table[*el1] & ASCII) == 0); continue; } if ((table[*el2] & ASCII) == 0) { do { el2++; } while ((table[*el2] & ASCII) == 0); continue; } } if (field->dictionary) {/* Skip non-dict chars */ if ((table[*el1] & DICT) == 0) { do { el1++; } while ((table[*el1] & DICT) == 0); continue; } if ((table[*el2] & DICT) == 0) { do { el2++; } while ((table[*el2] & DICT) == 0); continue; } } if (field->fold_case) { /* Fold upper case to lower */ if (table[c1 = *el1++] & UPPER) c1 += 'a' - 'A'; if (table[c2 = *el2++] & UPPER) c2 += 'a' - 'A'; if (c1 == c2) continue; return c1 - c2; } return *el1 - *el2; } /* NOTREACHED */ } /* * Digits compares () the two strings that point to a number of digits followed * by an optional decimal point. */ int digits(str1, str2, check_sign) register char *str1, *str2; BOOL check_sign; /* True if sign must be checked */ { BOOL negative = FALSE; /* True if negative numbers */ int diff, pow, ret; /* Check for optional minus or plus sign */ if (check_sign) { if (*str1 == '-') { negative = TRUE; str1++; } else if (*str1 == '+') str1++; if (*str2 == '-') { if (negative == FALSE) return HIGHER; str2++; } else if (negative) return LOWER; else if (*str2 == '+') str2++; } /* Keep incrementing as long as digits are available and equal */ while ((table[*str1] & DIGIT) && table[*str2] & DIGIT) { if (*str1 != *str2) break; str1++; str2++; } /* First check for the decimal point. */ if (*str1 == '.' || *str2 == '.') { if (*str1 == '.') { if (*str2 == '.') /* Both. Check decimal part */ ret = digits(str1 + 1, str2 + 1, FALSE); else ret = (table[*str2] & DIGIT) ? LOWER : HIGHER; } else ret = (table[*str1] & DIGIT) ? HIGHER : LOWER; } /* Now either two digits differ, or unknown char is seen (e.g. end of string) */ else if ((table[*str1] & DIGIT) && (table[*str2] & DIGIT)) { diff = *str1 - *str2; /* Basic difference */ pow = 0; /* Check power of numbers */ while (table[*str1++] & DIGIT) pow++; while (table[*str2++] & DIGIT) pow--; ret = (pow == 0) ? diff : pow; } /* Unknown char. Check on which string it occurred */ else { if ((table[*str1] & DIGIT) == 0) ret = (table[*str2] & DIGIT) ? LOWER : SAME; else ret = HIGHER; } /* Reverse sense of comparisons if negative is true. (-1000 < -1) */ return(negative) ? -ret : ret; } /* Files_merge () merges all files as indicated by nr_of_files. Merging goes * in numbers of files that can be opened at the same time. (OPEN_FILES) */ void files_merge(file_cnt) register int file_cnt; /* Nr_of_files to merge */ { register int i; int limit; for (i = 0; i < file_cnt; i += OPEN_FILES) { /* Merge last files and store in output file */ if ((limit = i + OPEN_FILES) >= file_cnt) { open_outfile(); limit = file_cnt; } else { /* Merge OPEN_FILES files and store in temp * file */ temp_files[16] = file_cnt / 26 + 'a'; temp_files[17] = file_cnt % 26 + 'a'; if ((out_fd = creat(temp_files, 0644)) < 0) error(TRUE, "Cannot creat ", temp_files); file_cnt++; } merge(i, limit); } /* Cleanup mess */ i = (only_merge) ? args_limit - args_offset : 0; while (i < file_cnt) (void) unlink(file_name(i++)); } /* Merge () merges the files between start_file and limit_file. */ void merge(start_file, limit_file) int start_file, limit_file; { register MERGE *smallest; /* Keeps track of smallest line */ register int i; int file_cnt = limit_file - start_file; /* Nr of files to merge */ /* Calculate size in core available for file_cnt merge structs */ buf_size = MEMORY_SIZE / file_cnt - LINE_SIZE; mbrk(mem_top); /* First reset mem to lowest loc. */ disabled = 0; /* All files not done yet */ /* Set up merge structures. */ for (i = start_file; i < limit_file; i++) { smallest = &merge_f[i - start_file]; if (!strcmp(file_name(i), "-")) /* File is stdin */ smallest->fd = 0; else if ((smallest->fd = open(file_name(i), O_RDONLY)) < 0) { smallest->fd = ERROR; error(FALSE, "Cannot open ", file_name(i)); disabled++; /* Done this file */ continue; } smallest->buffer = msbrk(buf_size); smallest->line = msbrk(LINE_SIZE); smallest->cnt = smallest->read_chars = 0; (void) read_line(smallest); /* Read first line */ } if (disabled == file_cnt) { /* Couldn't open files */ (void) close(out_fd); return; } /* Find a merg struct to assign smallest. */ for (i = 0; i < file_cnt; i++) { if (merge_f[i].fd != ERROR) { smallest = &merge_f[i]; break; } } /* Loop until all files minus one are done */ while (disabled < file_cnt - 1) { if (uniq) /* Skip all same lines */ smallest = skip_lines(smallest, file_cnt); else { /* Find smallest line */ for (i = 0; i < file_cnt; i++) { if (merge_f[i].fd == ERROR) continue; /* We've had this one */ if (compare(merge_f[i].line, smallest->line) < 0) smallest = &merge_f[i]; } } /* Print line and read next */ smallest = print(smallest, file_cnt); } if (only_merge && uniq) uniq_lines(smallest); /* Print only uniq lines */ else /* Print rest of file */ while (print(smallest, file_cnt) != NIL_MERGE); put_line(NIL_PTR); /* Flush output buffer */ } /* Put_line () prints the line into the out_fd filedescriptor. If line equals * NIL_PTR, the out_fd is flushed and closed. */ void put_line(line) register char *line; { static int index = 0; /* Index in out_buffer */ if (line == NIL_PTR) { /* Flush and close */ mwrite(out_fd, out_buffer, index); index = 0; (void) close(out_fd); return; } do { /* Fill out_buffer with line */ out_buffer[index++] = *line; if (index == IO_SIZE) { mwrite(out_fd, out_buffer, IO_SIZE); index = 0; } } while (*line++ != '\n'); } /* * Print () prints the line of the merg structure and tries to read another one. * If this fails, it returns the next merg structure which file_descriptor is * still open. If none could be found, a NIL structure is returned. */ MERGE *print(merg, file_cnt) register MERGE *merg; int file_cnt; /* Nr of files that are being merged */ { register int i; put_line(merg->line); /* Print the line */ if (read_line(merg) == ERROR) { /* Read next line */ for (i = 0; i < file_cnt; i++) { if (merge_f[i].fd != ERROR) { merg = &merge_f[i]; break; } } if (i == file_cnt) /* No more files left */ return NIL_MERGE; } return merg; } /* Read_line () reads a line from the fd from the merg struct. If the read * failed, disabled is incremented and the file is closed. Readings are * done in buf_size bytes. * Lines longer than LINE_SIZE are silently truncated. */ int read_line(merg) register MERGE *merg; { register char *ptr = merg->line - 1; /* Ptr buf that will hold line */ do { ptr++; if (merg->cnt == merg->read_chars) { /* Read new buffer */ if ((merg->read_chars = read(merg->fd, merg->buffer, buf_size)) <= 0) { (void) close(merg->fd); /* OOPS */ merg->fd = ERROR; disabled++; return ERROR; } merg->cnt = 0; } *ptr = merg->buffer[merg->cnt++]; /* Assign next char of line */ if (ptr - merg->line == LINE_SIZE - 1) *ptr = '\n'; /* Truncate very long lines */ } while (*ptr != '\n' && *ptr != '\0'); if (*ptr == '\0') /* Add '\n' to last line */ *ptr = '\n'; *++ptr = '\0'; /* Add '\0' */ return OK; } /* Skip_lines () skips all same lines in all the files currently being merged. * It returns a pointer to the merge struct containing the smallest line. */ MERGE *skip_lines(smallest, file_cnt) register MERGE *smallest; int file_cnt; { register int i; int ret; if (disabled == file_cnt - 1) /* We've had all */ return smallest; for (i = 0; i < file_cnt; i++) { if (merge_f[i].fd == ERROR || smallest == &merge_f[i]) continue; /* Don't check same file */ while ((ret = compare(merge_f[i].line, smallest->line)) == 0) { if (read_line(&merge_f[i]) == ERROR) break; /* EOF */ } if (ret < 0) /* Line wasn't smallest. Try again */ return skip_lines(&merge_f[i], file_cnt); } return smallest; } /* Uniq_lines () prints only the uniq lines out of the fd of the merg struct. */ void uniq_lines(merg) register MERGE *merg; { char lastline[LINE_SIZE]; /* Buffer to hold last line */ for (;;) { put_line(merg->line); /* Print this line */ copy(lastline, merg->line); /* and save it */ if (read_line(merg) == ERROR) /* Read the next */ return; /* Keep reading until lines duffer */ while (compare(lastline, merg->line) == SAME) if (read_line(merg) == ERROR) return; } /* NOTREACHED */ } /* * Check_file () checks if a file is sorted in order according to the arguments * given in main (). */ void check_file(fd, file) int fd; char *file; { register MERGE *merg; /* 1 file only */ char lastline[LINE_SIZE]; /* Save last line */ register int ret; /* ret status of compare */ if (fd == 0) file = "stdin"; merg = (MERGE *) mem_top; /* Assign MERGE structure */ merg->buffer = mem_top + sizeof(MERGE); merg->line = msbrk(LINE_SIZE); merg->cnt = merg->read_chars = 0; merg->fd = fd; buf_size = MEMORY_SIZE - sizeof(MERGE); if (read_line(merg) == ERROR) /* Read first line */ return; copy(lastline, merg->line); /* and save it */ for (;;) { if (read_line(merg) == ERROR) /* EOF reached */ break; if ((ret = compare(lastline, merg->line)) > 0) { error(FALSE, "Disorder in file ", file); write(2, merg->line, length(merg->line)); break; } else if (ret < 0) /* Copy if lines not equal */ copy(lastline, merg->line); else if (uniq) { error(FALSE, "Non uniq line in file ", file); write(2, merg->line, length(merg->line)); break; } } mbrk(mem_top); /* Reset mem */ } /* Length () returns the length of the argument line including the linefeed. */ int length(line) register char *line; { register int i = 1; /* Add linefeed */ while (*line++ != '\n') i++; return i; } /* Copy () copies the src line into the dest line including linefeed. */ void copy(dest, src) register char *dest, *src; { while ((*dest++ = *src++) != '\n'); } /* Msbrk() does a sbrk() and checks the return value. */ char *msbrk(size) register int size; { register char *address; if ((address = sbrk(size)) == (char *) -1) error(TRUE, "Not enough memory. Use chmem to allocate more", NIL_PTR); return address; } /* Mbrk() does a brk() and checks the return value. */ void mbrk(address) char *address; { if (brk(address) == -1) error(TRUE, "Cannot reset memory", NIL_PTR); } void catch(dummy) int dummy; /* to satisfy the prototype */ { register int i; signal(SIGINT, SIG_IGN); only_merge = FALSE; for (i = 0; i < 26; i++) (void) unlink(file_name(i)); exit(2); }