1.\"	$OpenBSD: sort.1,v 1.21 2003/07/14 12:56:07 jmc Exp $
2.\"
3.\" Copyright (c) 1991, 1993
4.\"	The Regents of the University of California.  All rights reserved.
5.\"
6.\" This code is derived from software contributed to Berkeley by
7.\" the Institute of Electrical and Electronics Engineers, Inc.
8.\"
9.\" Redistribution and use in source and binary forms, with or without
10.\" modification, are permitted provided that the following conditions
11.\" are met:
12.\" 1. Redistributions of source code must retain the above copyright
13.\"    notice, this list of conditions and the following disclaimer.
14.\" 2. Redistributions in binary form must reproduce the above copyright
15.\"    notice, this list of conditions and the following disclaimer in the
16.\"    documentation and/or other materials provided with the distribution.
17.\" 3. Neither the name of the University nor the names of its contributors
18.\"    may be used to endorse or promote products derived from this software
19.\"    without specific prior written permission.
20.\"
21.\" THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
22.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
24.\" ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
25.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
26.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
27.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
28.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
29.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
30.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
31.\" SUCH DAMAGE.
32.\"
33.\"     @(#)sort.1	8.1 (Berkeley) 6/6/93
34.\"
35.\"-
36.\" Copyright (c) 2008, 2009
37.\"	Thorsten “mirabilos” Glaser <tg@mirbsd.org>
38.\"-
39.\" Try to make GNU groff and AT&T nroff more compatible
40.\" * ` generates ‘ in gnroff, so use \`
41.\" * ' generates ’ in gnroff, \' generates ´, so use \*(aq
42.\" * - generates ‐ in gnroff, \- generates −, so .tr it to -
43.\"   thus use - for hyphens and \- for minus signs and option dashes
44.\" * ~ is size-reduced and placed atop in groff, so use \*(TI
45.\" * ^ is size-reduced and placed atop in groff, so use \*(ha
46.\" * \(en does not work in nroff, so use \*(en
47.\" The section after the "doc" macropackage has been loaded contains
48.\" additional code to mediate between the UCB mdoc macropackage (and
49.\" its variant as BSD mdoc in groff) and the GNU mdoc macropackage.
50.\" Define the strings aq, TI, ha and en twice: with glyph escapes under groff, and with plain fallbacks (en becomes an em dash) otherwise; groff also gets <=, >=, Rq, Lq, sL, sR and, presumably on the ascii, latin1 and utf8 devices, a translation of the minus escape to character 45.
51.ie \n(.g \{\
52.	if \*[.T]ascii .tr \-\N'45'
53.	if \*[.T]latin1 .tr \-\N'45'
54.	if \*[.T]utf8 .tr \-\N'45'
55.	ds <= \[<=]
56.	ds >= \[>=]
57.	ds Rq \[rq]
58.	ds Lq \[lq]
59.	ds sL \(aq
60.	ds sR \(aq
61.	if \*[.T]utf8 .ds sL `
62.	if \*[.T]ps .ds sL `
63.	if \*[.T]utf8 .ds sR '
64.	if \*[.T]ps .ds sR '
65.	ds aq \(aq
66.	ds TI \(ti
67.	ds ha \(ha
68.	ds en \(en
69.\}
70.el \{\
71.	ds aq '
72.	ds TI ~
73.	ds ha ^
74.	ds en \(em
75.\}
76.\" Implement .Dd with the Mdocdate RCS keyword: the stock .Dd is renamed
77.\" to .xD; when the first argument is the keyword, arguments 2-4 are passed
78.\" on as "month day, year", otherwise arguments 1-8 are forwarded unchanged.
79.rn Dd xD
80.de Dd
81.ie \\$1$Mdocdate: \{\
82.	xD \\$2 \\$3, \\$4
83.\}
84.el .xD \\$1 \\$2 \\$3 \\$4 \\$5 \\$6 \\$7 \\$8
85..
86.\"
87.\" .Dd must come before definition of .Mx, because when called
88.\" with -mandoc, it might implement .Mx itself, but we want to
89.\" use our own definition. And .Dd must come *first*, always.
90.\"
91.Dd $Mdocdate: November 22 2009 $
92.\" Check which macro package we use and record it in the string tT:
93.\" "gnu" under groff when volume-ds-1 is defined, "bsd" under groff
94.\" otherwise, and "ucb" when not running under groff.
95.ie \n(.g \{\
96.	ie d volume-ds-1 .ds tT gnu
97.	el .ds tT bsd
98.\}
99.el .ds tT ucb
100.\" Implement .Mx (MirBSD), which emits the text "MirOS" in one of two
101.\" variants chosen by tT: one for GNU mdoc (defined with escapes disabled
102.\" between .eo and .ec), one for the other packages. NOTE(review): both appear to append a following argument of type 2 - confirm.
103.ie "\*(tT"gnu" \{\
104.	eo
105.	de Mx
106.	nr curr-font \n[.f]
107.	nr curr-size \n[.ps]
108.	ds str-Mx \f[\n[curr-font]]\s[\n[curr-size]u]
109.	ds str-Mx1 \*[Tn-font-size]\%MirOS\*[str-Mx]
110.	if !\n[arg-limit] \
111.	if \n[.$] \{\
112.	ds macro-name Mx
113.	parse-args \$@
114.	\}
115.	if (\n[arg-limit] > \n[arg-ptr]) \{\
116.	nr arg-ptr +1
117.	ie (\n[type\n[arg-ptr]] == 2) \
118.	as str-Mx1 \~\*[arg\n[arg-ptr]]
119.	el \
120.	nr arg-ptr -1
121.	\}
122.	ds arg\n[arg-ptr] "\*[str-Mx1]
123.	nr type\n[arg-ptr] 2
124.	ds space\n[arg-ptr] "\*[space]
125.	nr num-args (\n[arg-limit] - \n[arg-ptr])
126.	nr arg-limit \n[arg-ptr]
127.	if \n[num-args] \
128.	parse-space-vector
129.	print-recursive
130..
131.	ec
132.	ds sP \s0
133.	ds tN \*[Tn-font-size]
134.\}
135.el \{\
136.	de Mx
137.	nr cF \\n(.f
138.	nr cZ \\n(.s
139.	ds aa \&\f\\n(cF\s\\n(cZ
140.	if \\n(aC==0 \{\
141.		ie \\n(.$==0 \&MirOS\\*(aa
142.		el .aV \\$1 \\$2 \\$3 \\$4 \\$5 \\$6 \\$7 \\$8 \\$9
143.	\}
144.	if \\n(aC>\\n(aP \{\
145.		nr aP \\n(aP+1
146.		ie \\n(C\\n(aP==2 \{\
147.			as b1 \&MirOS\ #\&\\*(A\\n(aP\\*(aa
148.			ie \\n(aC>\\n(aP \{\
149.				nr aP \\n(aP+1
150.				nR
151.			\}
152.			el .aZ
153.		\}
154.		el \{\
155.			as b1 \&MirOS\\*(aa
156.			nR
157.		\}
158.	\}
159..
160.\}
161.\"-
162.Dt SORT 1
163.Os
164.Sh NAME
165.Nm sort
166.Nd sort or merge text files
167.Sh SYNOPSIS
168.Nm sort
169.Op Fl cmubdfinrH
170.Op Fl t Ar char
171.Op Fl R Ar char
172.Oo
173.Fl k Ar field1[,field2]
174.Oc
175.Ar ...
176.Op Fl T Ar dir
177.Op Fl o Ar output
178.Op Ar file
179.Ar ...
180.Sh DESCRIPTION
181The
182.Nm
183utility sorts text files by lines.
184Comparisons are based on one or more sort keys extracted
185from each line of input, and are performed lexicographically.
186By default, if keys are not given,
187.Nm
188regards each input line as a single field.
189.Pp
190The options are as follows:
191.Bl -tag -width Ds
192.It Fl c
193Check that the single input file is sorted.
194If the file is not sorted,
195.Nm
196produces the appropriate error messages and exits with code 1; otherwise,
197.Nm
198returns 0.
199.Nm
200.Fl c
201produces no output, except the error messages on
202.Em stderr .
203.It Fl m
204Merge only; the input files are assumed to be pre-sorted.
205.It Fl o Ar output
206The argument given is the name of an
207.Ar output
208file to be used instead of the standard output.
209This file can be the same as one of the input files.
210.It Fl T Ar dir
211Use
212.Ar dir
213as the directory for temporary files.
214The default is the contents of the environment variable
215.Ev TMPDIR
216or
217.Pa /var/tmp
218if
219.Ev TMPDIR
220does not exist.
221.It Fl u
222Unique: suppress all but one in each set of lines having equal keys.
223If used with the
224.Fl c
225option, check that there are no lines with duplicate keys.
226.El
227.Pp
228The following options override the default ordering rules.
229When ordering options appear independently of key field
230specifications, the requested field ordering rules are
231applied globally to all sort keys.
232When attached to a specific key (see
233.Fl k ) ,
234the ordering options override
235all global ordering options for that key.
236.Bl -tag -width indent
237.It Fl d
238Only blank space and alphanumeric characters
239.\" according
240.\" to the current setting of LC_CTYPE
241are used in making comparisons.
242.It Fl f
243Considers all lowercase characters that have uppercase
244equivalents to be the same for purposes of comparison.
245.It Fl i
246Ignore all non-printable characters.
247.It Fl n
248An initial numeric string, consisting of optional blank space, optional
249minus sign, and zero or more digits (including decimal point)
250.\" with
251.\" optional radix character and thousands
252.\" separator
253.\" (as defined in the current locale),
254is sorted by arithmetic value.
255(The
256.Fl n
257option no longer implies the
258.Fl b
259option.)
260.It Fl r
261Reverse the sense of comparisons.
262.It Fl H
263Use a merge sort instead of a radix sort.
264This option should be used for files larger than 60 MB.
265.El
266.Pp
267The treatment of field separators can be altered using these options:
268.Bl -tag -width indent
269.It Fl b
270Ignores leading blank space when determining the start
271and end of a restricted sort key.
272A
273.Fl b
274option specified before the first
275.Fl k
276option applies globally to all
277.Fl k
278options.
279Otherwise, the
280.Fl b
281option can be attached independently to each
282.Ar field
283argument of the
284.Fl k
285option (see below).
286Note that the
287.Fl b
288option has no effect unless key fields are specified.
289.It Fl t Ar char
290.Ar char
291is used as the field separator character.
292The initial
293.Ar char
294is not considered to be part of a field when determining key offsets.
295Each occurrence of
296.Ar char
297is significant (for example,
298.Dq Ar charchar
299delimits an empty field).
300If
301.Fl t
302is not specified, the default field separator is a sequence of
303blank-space characters, and consecutive blank spaces do
304.Em not
305delimit an empty field; further, the initial blank space
306.Em is
307considered part of a field when determining key offsets.
308.It Fl R Ar char
309.Ar char
310is used as the record separator character.
311This should be used with discretion;
312.Fl R Ar <alphanumeric>
313usually produces undesirable results.
314The default record separator is newline.
315.It Fl k Ar field1[,field2]
316Designates the starting position,
317.Ar field1 ,
318and optional ending position,
319.Ar field2 ,
320of a key field.
321The
322.Fl k
323option replaces the obsolescent options
324.Cm \(pl Ns Ar pos1
325and
326.Fl Ns Ar pos2 .
327.El
328.Pp
329The following operands are available:
330.Bl -tag -width indent
331.It Ar file
332The pathname of a file to be sorted, merged, or checked.
333If no
334.Ar file
335operands are specified, or if a
336.Ar file
337operand is
338.Fl ,
339the standard input is used.
340.El
341.Pp
342A field is defined as a maximal sequence of characters other than the
343field separator and record separator
344.Pq newline by default .
345Initial blank spaces are included in the field unless
346.Fl b
347has been specified;
348the first blank space of a sequence of blank spaces acts as the field
349separator and is included in the field (unless
350.Fl t
351is specified).
352For example, by default all blank spaces at the beginning of a line are
353considered to be part of the first field.
354.Pp
355Fields are specified by the
356.Fl k Ar field1[,field2]
357argument.
358A missing
359.Ar field2
360argument defaults to the end of a line.
361.Pp
362The arguments
363.Ar field1
364and
365.Ar field2
366have the form
367.Em m.n
368.Em (m,n \*(Gt 0)
369and can be followed by one or more of the letters
370.Cm b , d , f , i ,
371.Cm n ,
372and
373.Cm r ,
374which correspond to the options discussed above.
375A
376.Ar field1
377position specified by
378.Em m.n
379is interpreted as the
380.Em n Ns th
381character from the beginning of the
382.Em m Ns th
383field.
384A missing
385.Em \&.n
386in
387.Ar field1
388means
389.Ql \&.1 ,
390indicating the first character of the
391.Em m Ns th
392field; if the
393.Fl b
394option is in effect,
395.Em n
396is counted from the first non-blank character in the
397.Em m Ns th
398field;
399.Em m Ns \&.1b
400refers to the first non-blank character in the
401.Em m Ns th
402field.
403.No 1\&. Ns Em n
404refers to the
405.Em n Ns th
406character from the beginning of the line;
407if
408.Em n
409is greater than the length of the line, the field is taken to be empty.
410.Pp
411A
412.Ar field2
413position specified by
414.Em m.n
415is interpreted as the
416.Em n Ns th
417character (including separators) of the
418.Em m Ns th
419field.
420A missing
421.Em \&.n
422indicates the last character of the
423.Em m Ns th
424field;
425.Em m
426= \&0
427designates the end of a line.
428Thus the option
429.Fl k Ar v.x,w.y
430is synonymous with the obsolescent option
431.Cm \(pl Ns Ar v\-\&1.x\-\&1
432.Fl Ns Ar w\-\&1.y ;
433when
434.Em y
435is omitted,
436.Fl k Ar v.x,w
437is synonymous with
438.Cm \(pl Ns Ar v\-\&1.x\-\&1
439.Fl Ns Ar w\&.0 .
440The obsolescent
441.Cm \(pl Ns Ar pos1
442.Fl Ns Ar pos2
443option is still supported, except for
444.Fl Ns Ar w\&.0b ,
445which has no
446.Fl k
447equivalent.
448.Pp
449The
450.Nm
451utility shall exit with one of the following values:
452.Pp
453.Bl -tag -width flag -compact
454.It 0
455Normal behavior.
456.It 1
457On disorder (or non-uniqueness) with the
458.Fl c
459option.
460.It 2
461An error occurred.
462.El
463.Sh ENVIRONMENT
464.Bl -tag -width Fl
465.It Ev TMPDIR
466Path in which to store temporary files.
467Note that
468.Ev TMPDIR
469may be overridden by the
470.Fl T
471option.
472.El
473.Sh FILES
474.Bl -tag -width Pa -compact
475.It Pa /var/tmp/sort.*
476default temporary directories
477.It Pa Ar output Ns #PID
478temporary name for
479.Ar output
480if
481.Ar output
482already exists
483.El
484.Sh SEE ALSO
485.Xr comm 1 ,
486.Xr join 1 ,
487.Xr uniq 1 ,
488.Xr radixsort 3
489.Sh HISTORY
490A
491.Nm
492command appeared in
493.At v3 .
494.Sh NOTES
495.Nm
496has no limits on input line length (other than those imposed by available
497memory) or any restrictions on bytes allowed within lines.
498.Pp
499To protect data,
500.Nm
501.Fl o
502calls
503.Xr link 2
504and
505.Xr unlink 2 ,
506and thus fails on protected directories.
507.Pp
508The current sort command uses lexicographic radix sorting, which requires
509that sort keys be kept in memory (as opposed to previous versions which
510used quick and merge sorts and did not).
511Thus performance depends highly on efficient choice of sort keys, and the
512.Fl b
513option and the
514.Ar field2
515argument of the
516.Fl k
517option should be used whenever possible.
518Similarly,
519.Nm
520.Fl k1f
521is equivalent to
522.Nm
523.Fl f
524and may take twice as long.
525.Sh BUGS
526To sort files larger than 60 MB, use
527.Nm
528.Fl H ;
529files larger than 704 MB must be sorted in smaller pieces, then merged.
530