1package Porting::updateAUTHORS;
2use strict;
3use warnings;
4use Data::Dumper;
5use Encode qw(encode_utf8 decode_utf8 decode);
6use Digest::SHA qw(sha256_base64);
7use Text::Wrap qw(wrap);
8use Unicode::Collate;
9use Cwd qw(getcwd);
10use feature 'fc';
11$Text::Wrap::columns= 80;
12
13# The style of this file is determined by:
14#
15# perltidy -w -ple -bbb -bbc -bbs -nolq -l=80 -noll -nola -nwls='=' \
16#   -isbc -nolc -otr -kis -ci=4 -se -sot -sct -nsbl -pt=2 -fs  \
17#   -fsb='#start-no-tidy' -fse='#end-no-tidy'
18
19# Info and config for passing to git log.
20#   %an: author name
21#   %aN: author name (respecting .mailmap, see git-shortlog(1) or git-blame(1))
22#   %ae: author email
23#   %aE: author email (respecting .mailmap, see git-shortlog(1) or git-blame(1))
24#   %cn: committer name
25#   %cN: committer name (respecting .mailmap, see git-shortlog(1) or git-blame(1))
26#   %ce: committer email
27#   %cE: committer email (respecting .mailmap, see git-shortlog(1) or git-blame(1))
28#   %H: commit hash
29#   %h: abbreviated commit hash
30#   %s: subject
31#   %x00: print a byte from a hex code
32
33my %field_spec= (
34    "an" => "author_name",
35    "aN" => "author_name_mm",
36    "ae" => "author_email",
37    "aE" => "author_email_mm",
38    "cn" => "committer_name",
39    "cN" => "committer_name_mm",
40    "ce" => "committer_email",
41    "cE" => "committer_email_mm",
42    "H"  => "commit_hash",
43    "h"  => "abbrev_hash",
44    "s"  => "commit_subject",
45);
46
47my $Collate= Unicode::Collate->new(level => 1, indentical => 1);
48my @field_codes= sort keys %field_spec;
49my @field_names= map { $field_spec{$_} } @field_codes;
50my $tformat= "=" . join "%x09", map { "%" . $_ } @field_codes;
51
52sub _make_name_author_info {
53    my ($self, $commit_info, $name_key)= @_;
54    my $author_info= $self->{author_info};
55    (my $email_key= $name_key) =~ s/name/email/;
56    my $email= $commit_info->{$email_key};
57    my $name= $commit_info->{$name_key};
58
59    my $line= $author_info->{"email2line"}{$email}
60        // $author_info->{"name2line"}{ lc($name) };
61
62    $line //= sprintf "%-31s<%s>",
63        $commit_info->{$name_key}, $commit_info->{$email_key};
64    $commit_info->{ $name_key . "_canon" }= $line;
65    return $line;
66}
67
68sub _make_name_simple {
69    my ($self, $commit_info, $key)= @_;
70    my $name_key= $key . "_name";
71    my $email_key= $key . "_email";
72    return sprintf "%s <%s>", $commit_info->{$name_key},
73        lc($commit_info->{$email_key});
74}
75
76sub __fold_trim_ws {
77    my ($munged)= @_;
78    $munged =~ s/\s+/ /g;
79    $munged =~ s/\A\s+//;
80    $munged =~ s/\s+\z//;
81    return $munged;
82}
83
84sub _register_author {
85    my ($self, $name, $type)= @_;
86
87    return if $self->_logical_exclude_author($name);
88
89    my $digest= $self->_keeper_digest($name)
90        or return;
91
92    $self->{who_stats}{$name}{$type}++;
93
94    $self->{author_info}{"lines"}{$name}
95        and return;
96
97    my $munged= __fold_trim_ws($name);
98    if ($self->{exclude_missing}) {
99        $self->_exclude_contrib($name, $digest);
100    }
101    else {
102        $self->{author_info}{"lines"}{$name}++;
103
104        my $munged= __fold_trim_ws($name);
105        warn encode_utf8 sprintf
106            "New %s '%s' (%s) will be added to AUTHORS file.\n",
107            $type, $munged, $digest
108            if $self->{verbose};
109    }
110}
111
112sub git_conf_get {
113    my ($self, $setting)= @_;
114    chomp(my $value= `git config --get $setting`);
115    return decode_utf8 $value;
116}
117
118sub current_git_user_name {
119    my ($self)= @_;
120    return $self->git_conf_get("user.name");
121}
122
123sub current_git_user_email {
124    my ($self)= @_;
125    return $self->git_conf_get("user.email");
126}
127
128sub current_git_name_email {
129    my ($self, $type)= @_;
130    my $name=
131           $ENV{"GIT_\U$type\E_NAME"}
132        || $self->git_conf_get("\L$type\E.name")
133        || $self->current_git_user_name();
134    my $email=
135           $ENV{"GIT_\U$type\E_EMAIL"}
136        || $self->git_conf_get("\L$type\E.email")
137        || $self->current_git_user_email();
138    return $name, $email;
139}
140
141sub format_name_email {
142    my ($self, $name, $email)= @_;
143    return sprintf "%s <%s>", $name, $email;
144}
145
146sub current_committer_name_email {
147    my ($self, $full)= @_;
148    my ($n,    $e)= $self->current_git_name_email("committer");
149    return $full ? $self->format_name_email($n, $e) : ($n, $e);
150}
151
152sub current_author_name_email {
153    my ($self, $full)= @_;
154    my ($n,    $e)= $self->current_git_name_email("author");
155    return $full ? $self->format_name_email($n, $e) : ($n, $e);
156}
157
158sub git_status_porcelain {
159    my ($self)= @_;
160    my $status= `git status --porcelain`;
161    return $status // "";
162}
163
164sub finalize_commit_info {
165    my ($self, $commit_info)= @_;
166    my $author= $commit_info->{author_name_mm_canon};
167    my $author_stats= $self->{who_stats}{$author} ||= {};
168
169    my $file_info= $commit_info->{files} ||= {};
170    foreach my $file (keys %{$file_info}) {
171        if (!$self->{file_stats}{$file}) {
172            $self->{summary_stats}{num_files}++;
173        }
174        my $fs= $self->{file_stats}{$file}          ||= {};
175        my $afs= $author_stats->{file_stats}{$file} ||= {};
176        my $added= $file_info->{$file}{lines_added};
177        my $removed= $file_info->{$file}{lines_removed};
178        my $delta= $file_info->{$file}{lines_delta};
179        defined $_ and $_ eq "-" and undef $_ for $added, $removed;
180
181        if (defined $added) {
182            for my $h ($author_stats, $fs, $afs) {
183                $h->{lines_delta}   += $delta;
184                $h->{lines_added}   += $added;
185                $h->{lines_removed} += $removed;
186            }
187        }
188        else {
189            $author_stats->{binary_change}++;
190            $fs->{binary_change}++;
191            $afs->{binary_change}++;
192        }
193        $afs->{commits}++
194            or $author_stats->{num_files}++;
195
196        $fs->{commits}++
197            or $self->{summary_stats}{num_files}++;
198
199        $fs->{who}{$author}++
200            or $self->{summary_stats}{authors}++;
201    }
202}
203
204sub read_commit_log {
205    my ($self)= @_;
206    my $author_info= $self->{author_info}   ||= {};
207    my $mailmap_info= $self->{mailmap_info} ||= {};
208
209    my $commits_read= 0;
210    my @args= ("--pretty='format:$tformat'");
211    push @args, "--numstat" if $self->{numstat};
212    push @args, "'$self->{commit_range}'" if $self->{commit_range};
213
214    my $last_commit_info;
215    my $cmd= qq(git -c diff.algorithm=myers log @args);
216    $cmd =~ s/'/"/g if $^O =~ /Win/;
217
218    # If we run under -Dmksymlinks git might not have access to the
219    # .mailmap unless we chdir into the git working tree. Ideally we
220    # would prefer to pass in the location of the .mailmap file to git,
221    # but it doesn't support that as a command line option. We can't
222    # just chdir into the source_dir as that would break scalar PerlIO
223    # layer operations which are loaded late. So we chdir before we
224    # read the git log, and then chdir right back after we have opened
225    # the handle. Note there is related code in read_mailmap_file(),
226    # if you change this also change that.
227
228    my $cwd = getcwd();
229    if ($self->{source_dir}) {
230        chdir $self->{source_dir};
231    }
232
233    open my $fh, "-|", $cmd
234        or die "Failed to open git log pipe: $!";
235
236    chdir $cwd;
237
238    binmode($fh);
239    while (defined(my $line= <$fh>)) {
240        chomp $line;
241        $line= decode_utf8($line);
242        if ($line =~ s/^=//) {
243            $self->finalize_commit_info($last_commit_info)
244                if $last_commit_info;
245        }
246        elsif ($line =~ /\S/) {
247            my ($added, $removed, $file)= split /\s+/, $line;
248            if ($added ne "-") {
249                $last_commit_info->{files}{$file}= {
250                    lines_added   => $added,
251                    lines_removed => $removed,
252                    lines_delta   => $added - $removed,
253                };
254            }
255            else {
256                $last_commit_info->{files}{$file}{binary_changes}++;
257            }
258            next;
259        }
260        else {
261            # whitspace only or empty line
262            next;
263        }
264        $commits_read++;
265        my $commit_info= {};
266        $last_commit_info= $commit_info;
267        @{$commit_info}{@field_names}= split /\t/, $line, 0 + @field_names;
268
269        my $author_name_mm_canon=
270            $self->_make_name_author_info($commit_info, "author_name_mm");
271
272        my $committer_name_mm_canon=
273            $self->_make_name_author_info($commit_info, "committer_name_mm");
274
275        my $author_name_real= $self->_make_name_simple($commit_info, "author");
276
277        my $committer_name_real=
278            $self->_make_name_simple($commit_info, "committer");
279
280        my ($author_good, $committer_good);
281
282        if (   $self->_keeper_digest($author_name_mm_canon)
283            && $self->_keeper_digest($author_name_real))
284        {
285            $self->_check_name_mailmap($author_name_mm_canon, $author_name_real,
286                $commit_info, "author name");
287            $self->_register_author($author_name_mm_canon, "author");
288            $author_good= 1;
289        }
290
291        if (   $self->_keeper_digest($committer_name_mm_canon)
292            && $self->_keeper_digest($committer_name_real))
293        {
294            $self->_check_name_mailmap($committer_name_mm_canon,
295                $committer_name_real, $commit_info, "committer name");
296            $self->_register_author($committer_name_mm_canon, "committer");
297            $committer_good= 1;
298        }
299        if (    $author_good
300            and $committer_good
301            and $committer_name_mm_canon ne $author_name_mm_canon)
302        {
303            $self->{who_stats}{$committer_name_mm_canon}{applied}++;
304        }
305    }
306    $self->finalize_commit_info($last_commit_info) if $last_commit_info;
307    if (!$commits_read) {
308        if ($self->{commit_range}) {
309            die "No commits in range '$self->{commit_range}'\n";
310        }
311        else {
312            die "Panic! There are no commits!\n";
313        }
314    }
315    return $author_info;
316}
317
318sub dupe_info {
319    my ($self)= @_;
320    my $msg= "";
321    foreach my $type (sort keys %{ $self->{dupe} || {} }) {
322        $msg .= "Duplicate \u$type in $self->{authors_file}:\n";
323        foreach my $key (sort keys %{ $self->{dupe}{$type} }) {
324            $msg .= "  \u$type '$key'\n";
325            foreach my $line (sort keys %{ $self->{dupe}{$type}{$key} }) {
326                $msg .= "    $line\n";
327            }
328        }
329    }
330    return $msg;
331}
332
333sub read_authors_file {
334    my ($self)= @_;
335    my $authors_file= $self->{authors_file};
336
337    my @authors_preamble;
338    open my $in_fh, "<", $authors_file
339        or die "Failed to open for read '$authors_file': $!";
340    my $raw_text= "";
341    my $found_sep= 0;
342    while (defined(my $line= <$in_fh>)) {
343        $raw_text .= $line;
344        $line= decode_utf8($line);
345        chomp $line;
346        push @authors_preamble, $line;
347        if ($line =~ /^--/) {
348            $found_sep= 1;
349            last;
350        }
351    }
352    if (!$found_sep) {
353        die sprintf <<'EOFMT', $authors_file;
354Possibly corrupted authors file '%s'.
355
356There should be a big '#' comment block at the start of the file
357followed by "--" followed by a list of names and email/contact
358details. We couldn't find the separator. Where did it go?
359
360Cowardly refusing to continue until this is fixed.
361EOFMT
362    }
363    my %author_info;
364    while (defined(my $line= <$in_fh>)) {
365        $raw_text .= $line;
366        $line= decode_utf8($line);
367        chomp $line;
368        my ($name, $email);
369        my $copy= $line;
370        $copy =~ s/\s+\z//;
371        if ($copy =~ s/<([^<>]*)>//) {
372            $email= $1;
373        }
374        elsif ($copy =~ s/\s+(\@\w+)\z//) {
375            $email= $1;
376        }
377        $copy =~ s/\s+\z//;
378        $name= $copy;
379        $email //= "unknown";
380        my $orig_name= $name;
381        my $orig_email= $email;
382        if (my $new_name= $self->{change_name_for_name}{$orig_name}) {
383            $name= $new_name;
384        }
385        if (my $new_name= $self->{change_name_for_email}{$orig_email}) {
386            $name= $new_name;
387        }
388        if (my $new_email= $self->{change_email_for_name}{$orig_name}) {
389            $email= $new_email;
390        }
391        if (my $new_email= $self->{change_email_for_email}{$orig_email}) {
392            $email= $new_email;
393        }
394        $line= sprintf "%-31s%s", $name, $email =~ /^\@/ ? $email : "<$email>";
395        $line =~ s/\s+<unknown>\z//;
396        $email= lc($email);
397
398        $line =~ s/\s+\z//;
399        $author_info{"lines"}{$line}++;
400        if ($email and $email ne "unknown") {
401            if (my $other= $author_info{"email2line"}{$email}) {
402                $self->{dupe}{email}{$email}{$other}= 1;
403                $self->{dupe}{email}{$email}{$line}= 1;
404            }
405            else {
406                $author_info{"email2line"}{$email}= $line;
407            }
408        }
409        if ($name and $name ne "unknown") {
410            if (my $other= $author_info{"name2line"}{ lc($name) }) {
411                $self->{dupe}{name}{$name}{$other}= 1;
412                $self->{dupe}{name}{$name}{$line}= 1;
413            }
414            else {
415                $author_info{"name2line"}{ lc($name) }= $line;
416            }
417        }
418        $author_info{"email2name"}{$email} //= $name
419            if $email
420            and $name
421            and $email ne "unknown";
422        $author_info{"name2email"}{$name} //= $email
423            if $name and $name ne "unknown";
424        $author_info{"clean_full"}{ __fold_trim_ws($line) }= $line;
425    }
426    close $in_fh
427        or die "Failed to close '$authors_file': $!";
428
429    $self->{author_info}= \%author_info;
430    $self->{authors_preamble}= \@authors_preamble;
431    $self->{authors_raw_text}= $raw_text;
432    return (\%author_info, \@authors_preamble, $raw_text);
433}
434
435sub update_authors_file {
436    my ($self)= @_;
437
438    my $author_info= $self->{author_info};
439    my $authors_preamble= $self->{authors_preamble};
440    my $authors_file= $self->{authors_file};
441    my $old_raw_text= $self->{authors_raw_text};
442
443    my $authors_file_new= $authors_file . ".new";
444    my $new_raw_text= "";
445    {
446        open my $out_fh, ">", \$new_raw_text
447            or die "Failed to open scalar buffer for write: $!";
448        foreach my $line (@$authors_preamble) {
449            print $out_fh encode_utf8($line), "\n"
450                or die "Failed to print to scalar buffer handle: $!";
451        }
452        foreach my $author (__sorted_hash_keys($author_info->{"lines"})) {
453            next if $self->_logical_exclude_author($author);
454            my $author_mm= $self->_author_to_mailmap($author);
455            if (!$self->_keeper_digest($author_mm)) {
456                next;
457            }
458            print $out_fh encode_utf8($author), "\n"
459                or die "Failed to print to scalar buffer handle: $!";
460        }
461        close $out_fh
462            or die "Failed to close scalar buffer handle: $!";
463    }
464    if ($new_raw_text ne $old_raw_text) {
465        $self->{changed_count}++;
466        $self->_log_file_changes_quick_and_dirty_diff($authors_file,
467            $old_raw_text, $new_raw_text);
468
469        if ($self->{no_update}) {
470            return 1;
471        }
472
473        warn "Updating '$authors_file'\n" if $self->{verbose};
474
475        open my $out_fh, ">", $authors_file_new
476            or die "Failed to open for write '$authors_file_new': $!";
477        binmode $out_fh;
478        print $out_fh $new_raw_text;
479        close $out_fh
480            or die "Failed to close '$authors_file_new': $!";
481        rename $authors_file_new, $authors_file
482            or die
483            "Failed to rename '$authors_file_new' to '$authors_file': $!";
484        return 1;
485    }
486    else {
487        return 0;
488    }
489}
490
491sub read_mailmap_file {
492    my ($self)= @_;
493    my $mailmap_file= $self->{mailmap_file};
494
495    # If we run under -Dmksymlinks the .mailmap might actually be
496    # located in a different directory than the one we are running from.
497    # We could munge the $mailmap_file to be relative to source_dir if
498    # it was not already an absolute path, but that would require
499    # loading File::Spec, and doing THAT wouldn't help us when we read
500    # the git log (see the related code for doing that in the
501    # read_commit_log() sub), so we use the same strategy of remembering
502    # our current working directory, chdir'ing into the source dir,
503    # opening the file, and then chdir'ing back in both cases.
504
505    my $cwd = getcwd();
506    if ($self->{source_dir}) {
507        chdir $self->{source_dir};
508    }
509    open my $in, "<", $mailmap_file
510        or die "Failed to read '$mailmap_file': $!";
511    chdir $cwd;
512    my %mailmap_hash;
513    my @mailmap_preamble;
514    my $line_num= 0;
515    my $raw_text= "";
516    while (defined(my $line= <$in>)) {
517        $raw_text .= $line;
518        $line= decode_utf8($line);
519        ++$line_num;
520        next unless $line =~ /\S/;
521        chomp($line);
522        if ($line =~ /^#/) {
523            if (!keys %mailmap_hash) {
524                push @mailmap_preamble, $line;
525            }
526            else {
527                die encode_utf8 "Not expecting comments after header ",
528                    "finished at line $line_num!\nLine: $line\n";
529            }
530        }
531        else {
532            $mailmap_hash{$line}= $line_num;
533        }
534    }
535    close $in
536        or die "Failed to close '$mailmap_file' after reading: $!";
537    if (!@mailmap_preamble) {
538        die sprintf <<'EOFMT', $mailmap_file;
539Possibly corrupted mailmap file '%s'.
540
541This file should have a preamble of '#' comments in it.
542
543Where did they go?
544
545Cowardly refusing to continue until this is fixed.
546EOFMT
547    }
548    $self->{orig_mailmap_hash}= \%mailmap_hash;
549    $self->{mailmap_preamble}= \@mailmap_preamble;
550    $self->{mailmap_raw_text}= $raw_text;
551    return (\%mailmap_hash, \@mailmap_preamble, $raw_text);
552}
553
554sub __sorted_hash_keys {
555    my ($hash)= @_;
556    return __sort_names(keys %$hash);
557}
558
559sub __sort_names {
560    my @sorted= sort { fc($a) cmp fc($b) || $a cmp $b } @_;
561    return @sorted;
562}
563
564# Returns 0 if the file needed to be changed, Return 1 if it does not.
565sub update_mailmap_file {
566    my ($self)= @_;
567    my $mailmap_hash= $self->{new_mailmap_hash};
568    my $mailmap_preamble= $self->{mailmap_preamble};
569    my $mailmap_file= $self->{mailmap_file};
570    my $old_raw_text= $self->{mailmap_raw_text};
571
572    my $new_raw_text= "";
573    {
574        my $cwd = getcwd();
575        open my $out, ">", \$new_raw_text
576            or die "Failed to open scalar buffer for write: $!";
577        foreach
578            my $line (@$mailmap_preamble, __sorted_hash_keys($mailmap_hash),)
579        {
580            next if $line =~ m!\A(.*) \1\z!;
581            print $out encode_utf8($line), "\n"
582                or die "Failed to print to scalar buffer handle: $!";
583        }
584        close $out
585            or die "Failed to close scalar buffer handle: $!";
586    }
587    if ($new_raw_text ne $old_raw_text) {
588        $self->{changed_count}++;
589        $self->_log_file_changes_quick_and_dirty_diff($mailmap_file,
590            $old_raw_text, $new_raw_text);
591
592        if ($self->{no_update}) {
593            return 1;
594        }
595
596        warn "Updating '$mailmap_file'\n"
597            if $self->{verbose};
598
599        my $mailmap_file_new= $mailmap_file . ".new";
600        open my $out, ">", $mailmap_file_new
601            or die "Failed to write '$mailmap_file_new': $!";
602        binmode $out
603            or die "Failed to binmode '$mailmap_file_new': $!";
604        print $out $new_raw_text
605            or die "Failed to print to '$mailmap_file_new': $!";
606        close $out
607            or die "Failed to close '$mailmap_file_new' after writing: $!";
608        rename $mailmap_file_new, $mailmap_file
609            or die
610            "Failed to rename '$mailmap_file_new' to '$mailmap_file': $!";
611        return 1;
612    }
613    else {
614        return 0;
615    }
616}
617
618sub parse_orig_mailmap_hash {
619    my ($self)= @_;
620    my $mailmap_hash= $self->{orig_mailmap_hash};
621
622    my @recs;
623    foreach my $line (__sorted_hash_keys($mailmap_hash)) {
624        my $line_num= $mailmap_hash->{$line};
625        $line =~ /^ \s* (?: ( [^<>]*? ) \s+ )? <([^<>]*)>
626                (?: \s+ (?: ( [^<>]*? ) \s+ )? <([^<>]*)> )? \s* \z /x
627            or die encode_utf8
628            "Failed to parse '$self->{mailmap_file}' line num $line_num: '$line'\n";
629        if (!$1 or !$2) {
630            die encode_utf8 "Both preferred name and email are mandatory ",
631                "in line num $line_num: '$line'";
632        }
633        my ($name, $email, $other_name, $other_email)= ($1, $2, $3, $4);
634        my ($orig_name, $orig_email)= ($1, $2);
635        if (my $new_name= $self->{change_name_for_name}{$orig_name}) {
636            $name= $new_name;
637        }
638        if (my $new_name= $self->{change_name_for_email}{$orig_email}) {
639            $name= $new_name;
640        }
641        if (my $new_email= $self->{change_email_for_name}{$orig_name}) {
642            $email= $new_email;
643        }
644        if (my $new_email= $self->{change_email_for_email}{$orig_email}) {
645            $email= $new_email;
646        }
647
648        push @recs, [ $name, $email, $other_name, $other_email, $line_num ];
649    }
650    return \@recs;
651}
652
653sub _safe_set_key {
654    my ($self, $hash, $root_key, $key, $val, $pretty_name)= @_;
655    $hash->{$root_key}{$key} //= $val;
656    my $prev= $hash->{$root_key}{$key};
657    if ($prev ne $val) {
658        die encode_utf8 "Collision on mapping $root_key: "
659            . " '$key' maps to '$prev' and '$val'\n";
660    }
661}
662
663my $O2P= "other2preferred";
664my $O2PN= "other2preferred_name";
665my $O2PE= "other2preferred_email";
666my $P2O= "preferred2other";
667my $N2P= "name2preferred";
668my $E2P= "email2preferred";
669
670my $blurb= "";    # FIXME - replace with a nice message
671
672sub known_contributor {
673    my ($self, $name, $email)= @_;
674    if (!$name or !$email) { return 0 }
675    my $combined= "$name <$email>";
676    return ((
677                   $self->{mailmap_info}{$O2P}{$combined}
678                && $self->_keeper_digest($combined)
679        ) ? 1 : 0
680    );
681}
682
683sub _check_name_mailmap {
684    my ($self, $auth_name, $raw_name, $commit_info, $descr)= @_;
685    my $mailmap_info= $self->{mailmap_info};
686
687    my $name= $self->_author_to_mailmap($auth_name);
688
689    my $digest= $self->_keeper_digest($name)
690        or return 1;    # known but ignore
691
692    my $name_info= $mailmap_info->{$P2O}{$name};
693
694    if (!$name_info || !$name_info->{$raw_name}) {
695        if ($self->{exclude_missing}) {
696            $self->_exclude_contrib($name, $digest);
697        }
698        else {
699            $mailmap_info->{add}{"$name $raw_name"}++;
700
701            warn encode_utf8 sprintf
702                "Unknown %s '%s' in commit %s '%s'\n%s",
703                $descr,
704                $name,
705                $commit_info->{"abbrev_hash"},
706                $commit_info->{"commit_subject"}, $blurb
707                if $self->{verbose};
708        }
709        return 0;
710    }
711    return 1;
712}
713
714sub _author_to_mailmap {
715    my ($self, $name)= @_;
716    $name =~ s/<([^<>]+)>/<\L$1\E>/
717        or $name =~ s/(\s)(\@\w+)\z/$1<\L$2\E>/
718        or $name .= " <unknown>";
719
720    $name= __fold_trim_ws($name);
721    return $name;
722}
723
724sub check_fix_mailmap_hash {
725    my ($self)= @_;
726    my $orig_mailmap_hash= $self->{orig_mailmap_hash};
727    my $author_info= $self->{author_info};
728    foreach my $key (keys %{ $author_info->{clean_full} }) {
729        $key .= " <unknown>"
730            unless $key =~ /\s+(?:<[^>]+>|\@\w+)\z/;
731        $key =~ s/\s+(\@\w+)\z/ <$1>/;
732        $orig_mailmap_hash->{"$key $key"} //= -1;
733    }
734    my $parsed= $self->parse_orig_mailmap_hash();
735    my @fixed;
736    my %seen_map;
737    my %pref_groups;
738
739    my $remove_no_names_with_overlaps= 0;
740
741    # first pass through the data, do any conversions, eg, LC
742    # the email address, decode any MIME-Header style email addresses.
743    # We also correct any preferred name entries so they match what
744    # we already have in AUTHORS, and check that there aren't collisions
745    # or other issues in the data.
746    foreach my $rec (@$parsed) {
747        my ($pname, $pemail, $oname, $oemail, $line_num)= @$rec;
748        $pemail= lc($pemail);
749        $oemail= lc($oemail) if defined $oemail;
750        if ($pname =~ /=\?UTF-8\?/) {
751            $pname= decode("MIME-Header", $pname);
752        }
753        my $auth_email= $author_info->{"name2email"}{$pname};
754        if ($auth_email) {
755            ## this name exists in authors, so use its email data for pemail
756            $pemail= $auth_email;
757        }
758        my $auth_name= $author_info->{"email2name"}{$pemail};
759        if ($auth_name) {
760            ## this email exists in authors, so use its name data for pname
761            $pname= $auth_name;
762        }
763
764        # neither name nor email exist in authors.
765        if ($pname ne "unknown") {
766            if (my $email= $seen_map{"name"}{$pname}) {
767                ## we have seen this pname before, check the pemail
768                ## is consistent
769                if ($email ne $pemail) {
770                    warn encode_utf8 "Inconsistent emails for name '$pname'"
771                        . " at line num $line_num: keeping '$email',"
772                        . " ignoring '$pemail'\n";
773                    $pemail= $email;
774                }
775            }
776            else {
777                $seen_map{"name"}{$pname}= $pemail;
778            }
779        }
780        if ($pemail ne "unknown") {
781            if (my $name= $seen_map{"email"}{$pemail}) {
782                ## we have seen this preferred_email before, check the preferred_name
783                ## is consistent
784                if ($name ne $pname) {
785                    warn encode_utf8 "Inconsistent name for email '$pemail'"
786                        . " at line num $line_num: keeping '$name', ignoring"
787                        . " '$pname'\n";
788                    $pname= $name;
789                }
790            }
791            else {
792                $seen_map{"email"}{$pemail}= $pname;
793            }
794        }
795
796        my $rec= [ $pname, $pemail, $oname, $oemail, $line_num ];
797        if ($remove_no_names_with_overlaps) {
798
799            # Build an index of "preferred name/email" to other-email, other name
800            # we use this later to remove redundant entries missing a name.
801            $pref_groups{"$pname $pemail"}{$oemail}{ $oname || "" }= $rec;
802        }
803        else {
804            push @fixed, $rec;
805        }
806    }
807
808    if ($remove_no_names_with_overlaps) {
809
810        # this removes entries like
811        # Joe <blogs> <whatever>
812        # where there is a corresponding
813        # Joe <blogs> Joe X <whatever>
814        foreach my $pref (__sorted_hash_keys(\%pref_groups)) {
815            my $entries= $pref_groups{$pref};
816            foreach my $email (__sorted_hash_keys($entries)) {
817                my @names= __sorted_hash_keys($entries->{$email});
818                if (0 and $names[0] eq "" and @names > 1) {
819                    shift @names;
820                }
821                foreach my $name (@names) {
822                    push @fixed, $entries->{$email}{$name};
823                }
824            }
825        }
826    }
827
828    # final pass through the dataset, build up a database
829    # we will use later for checks and updates, and reconstruct
830    # the canonical entries.
831    my $new_mailmap_hash= {};
832    my $mailmap_info=     {};
833    foreach my $rec (@fixed) {
834        my ($pname, $pemail, $oname, $oemail, $line_num)= @$rec;
835        my $preferred= "$pname <$pemail>";
836        my $other;
837        if (defined $oemail) {
838            $other= $oname ? "$oname <$oemail>" : "<$oemail>";
839        }
840        if (!$self->_keeper_digest($preferred)) {
841            $self->_exclude_contrib($other);
842            next;
843        }
844        elsif (!$self->_keeper_digest($other)) {
845            next;
846        }
847        if ($other and $other ne "<unknown>") {
848            $self->_safe_set_key($mailmap_info, $O2P,  $other, $preferred);
849            $self->_safe_set_key($mailmap_info, $O2PN, $other, $pname);
850            $self->_safe_set_key($mailmap_info, $O2PE, $other, $pemail);
851        }
852        $mailmap_info->{$P2O}{$preferred}{$other}++;
853        if ($pname ne "unknown") {
854            $self->_safe_set_key($mailmap_info, $N2P, $pname, $preferred);
855        }
856        if ($pemail ne "unknown") {
857            $self->_safe_set_key($mailmap_info, $E2P, $pemail, $preferred);
858        }
859        my $line= $preferred;
860        $line .= " $other" if $other;
861        $new_mailmap_hash->{$line}= $line_num;
862    }
863    $self->{new_mailmap_hash}= $new_mailmap_hash;
864    $self->{mailmap_info}= $mailmap_info;
865    return ($new_mailmap_hash, $mailmap_info);
866}
867
868sub add_new_mailmap_entries {
869    my ($self)= @_;
870    my $mailmap_hash= $self->{new_mailmap_hash};
871    my $mailmap_info= $self->{mailmap_info};
872    my $mailmap_file= $self->{mailmap_file};
873
874    my $mailmap_add= $mailmap_info->{add}
875        or return 0;
876
877    my $num= 0;
878    for my $new (__sorted_hash_keys($mailmap_add)) {
879        !$mailmap_hash->{$new}++ or next;
880        warn encode_utf8 "Updating '$mailmap_file' with: $new\n"
881            if $self->{verbose};
882        $num++;
883    }
884    return $num;
885}
886
887sub read_and_update {
888    my ($self)= @_;
889    my ($authors_file, $mailmap_file)=
890        %{$self}{qw(authors_file mailmap_file)};
891
892    # read the authors file and extract the info it contains
893    $self->read_authors_file();
894
895    # read the mailmap file.
896    $self->read_mailmap_file();
897
898    # check and possibly fix the mailmap data, and build a set of precomputed
899    # datasets to work with it.
900    $self->check_fix_mailmap_hash();
901
902    # update the mailmap based on any check or fixes we just did.
903    $self->update_mailmap_file();
904
905    # read the commits names using git log, and compares and checks
906    # them against the data we have in authors.
907    $self->read_commit_log();
908
909    # update the authors file with any changes
910    $self->update_authors_file();
911
912    # check if we discovered new email data from the commits that
913    # we need to write back to disk.
914    $self->add_new_mailmap_entries()
915        and $self->update_mailmap_file();
916
917    $self->update_exclude_file();
918
919    return $self->changed_count();
920}
921
922sub read_exclude_file {
923    my ($self)= @_;
924    my $exclude_file= $self->{exclude_file};
925    my $exclude_digest= $self->{exclude_digest} ||= {};
926
927    open my $in_fh, "<", $exclude_file
928        or do {
929        warn "Failed to open '$exclude_file': $!";
930        return;
931        };
932    my $head= "";
933    my $orig= "";
934    my $seen_data= 0;
935    while (defined(my $line= <$in_fh>)) {
936        $orig .= $line;
937        if ($line =~ /^\s*#/ || $line !~ /\S/) {
938            $head .= $line unless $seen_data;
939            next;
940        }
941        else {
942            $seen_data= 1;
943        }
944        chomp($line);
945        $line =~ s/\A\s+//;
946        $line =~ s/\s*(?:#.*)?\z//;
947        $exclude_digest->{$line}++ if length($line);
948    }
949    close $in_fh
950        or die "Failed to close '$exclude_file' after reading: $!";
951    if (!$head) {
952        die sprintf <<'EOFMT', $exclude_file;
953Possibly corrupted exclude file '%s'.
954
955This file should have a header of '#' comments in it.
956
957Where did they go?
958
959Cowardly refusing to continue until this is fixed.
960EOFMT
961    }
962    $self->{exclude_file_text_head}= $head;
963    $self->{exclude_file_text_orig}= $orig;
964
965    return $exclude_digest;
966}
967
968sub update_exclude_file {
969    my ($self)= @_;
970    my $exclude_file= $self->{exclude_file};
971    my $exclude_text= $self->{exclude_file_text_head};
972    foreach my $digest (__sorted_hash_keys($self->{exclude_digest})) {
973        $exclude_text .= "$digest\n";
974    }
975    if ($exclude_text ne $self->{exclude_file_text_orig}) {
976        $self->{changed_count}++;
977        $self->_log_file_changes_quick_and_dirty_diff($exclude_file,
978            $self->{exclude_file_text_orig},
979            $exclude_text);
980
981        if ($self->{no_update}) {
982            return 1;
983        }
984
985        warn "Updating '$exclude_file'\n" if $self->{verbose};
986
987        my $tmp_file= "$exclude_file.new";
988        open my $out_fh, ">", $tmp_file
989            or die "Cant open '$tmp_file' for write $!";
990        print $out_fh $exclude_text
991            or die "Failed to print to '$tmp_file': $!";
992        close $out_fh
993            or die "Failed to close '$tmp_file' after writing: $!";
994        rename $tmp_file, $exclude_file
995            or die "Failed to rename '$tmp_file' to '$exclude_file': $!";
996
997        return 1;
998    }
999    else {
1000        return 0;
1001    }
1002}
1003
1004sub changed_count {
1005    my ($self)= @_;
1006    return $self->{changed_count};
1007}
1008
1009sub changed_file {
1010    my ($self, $name)= @_;
1011    return $self->{changed_file}{$name};
1012}
1013
1014sub unchanged_file {
1015    my ($self, $name)= @_;
1016    return $self->changed_file($name) ? 0 : 1;
1017}
1018
1019sub new {
1020    my ($class, %self)= @_;
1021    $self{changed_count}= 0;
1022    for my $name (qw(authors_file mailmap_file exclude_file)) {
1023        $self{$name}
1024            or die "Property '$name' is mandatory in constructor";
1025    }
1026
1027    my $self= bless \%self, $class;
1028
1029    if (my $ary= $self->{exclude_contrib}) {
1030        $self->_exclude_contrib($_) for @$ary;
1031    }
1032
1033    $self->read_exclude_file();
1034
1035    die Dumper(\%self) if $self{dump_opts};
1036
1037    return $self;
1038}
1039
1040sub __digest {
1041    my $thing= $_[0];
1042    utf8::encode($thing);
1043    return sha256_base64($thing);
1044}
1045
1046# if this name is a "keeper" then return its digest
1047# (if we know the digest and it is marked for exclusion
1048# then we return 0)
1049sub _keeper_digest {
1050    my ($self, $real_name)= @_;
1051    my $digest;
1052    $digest= $self->{digest_cache}{$real_name};
1053
1054    if (!$digest) {
1055        my $name= __fold_trim_ws($real_name);
1056
1057        $digest= ($self->{digest_cache}{$name} //= __digest($name));
1058        $self->{digest_cache}{$real_name}= $digest;
1059    }
1060
1061    return $self->{exclude_digest}{$digest} ? 0 : $digest;
1062}
1063
1064# should we exclude this author from the AUTHORS file
1065# simply because of the form of their details?
1066sub _logical_exclude_author {
1067    my ($self, $author)= @_;
1068
1069    # don't know the persona
1070    return 1 if $author =~ /^unknown/;
1071
1072    # Someone at <unknown> with a single word name.
1073    # Eg, we wont list "Bob <unknown>"
1074    if ($author =~ s/\s*<unknown>\z//) {
1075        return 1 if $author =~ /^\w+$/;
1076    }
1077    return 0;
1078}
1079
1080# exclude this contributor by name, if digest isnt provided
1081# then it is computed using _digest.
1082sub _exclude_contrib {
1083    my ($self, $name, $digest)= @_;
1084
1085    # if we would exclude them anyway due to the logical
1086    # naming rules then we do not need to add them to the exclude
1087    # file.
1088    return if $self->_logical_exclude_author($name);
1089    $name= __fold_trim_ws($name);
1090    $digest //= __digest($name);
1091    $self->{exclude_digest}{$digest}++
1092        or warn "Excluding '$name' with '$digest'\n";
1093}
1094
1095sub _log_file_changes_quick_and_dirty_diff {
1096    my ($self, $file, $old_raw_text, $new_raw_text)= @_;
1097
1098    my %old;
1099    $old{$_}++ for split /\n/, $old_raw_text;
1100    my %new;
1101    $new{$_}++ for split /\n/, $new_raw_text;
1102    foreach my $key (keys %new) {
1103        delete $new{$key} if delete $old{$key};
1104    }
1105    $self->{changed_file}{$file}{add}= \%new if keys %new;
1106    $self->{changed_file}{$file}{del}= \%old if keys %old;
1107    return $self->{changed_file}{$file};
1108}
1109
1110sub _diff_diag {
1111    my ($self, $want_file)= @_;
1112    my $diag_str= "";
1113    foreach my $file (sort keys %{ $self->{changed_file} || {} }) {
1114        next if $want_file and $file ne $want_file;
1115        $diag_str .= "  File '$file' changes:\n";
1116        foreach my $action (sort keys %{ $self->{changed_file}{$file} }) {
1117            foreach
1118                my $line (sort keys %{ $self->{changed_file}{$file}{$action} })
1119            {
1120                $diag_str .= "    would $action: $line\n";
1121            }
1122        }
1123    }
1124    return $diag_str;
1125}
1126
1127my %pretty_name= (
1128    "author"         => "Authored",
1129    "committer"      => "Committed",
1130    "applied"        => "Applied",
1131    "name"           => "Name",
1132    "pos"            => "Pos",
1133    "num_files"      => "NFiles",
1134    "lines_added"    => "L++",
1135    "lines_removed"  => "L--",
1136    "lines_delta"    => "L+-",
1137    "binary_changed" => "Bin+-",
1138);
1139
1140sub report_stats {
1141    my ($self, $stats_key, @types)= @_;
1142    my @extra= "name";
1143    my @rows;
1144    my @total;
1145    foreach my $name (__sorted_hash_keys($self->{$stats_key})) {
1146        my @data= map { $self->{$stats_key}{$name}{$_} // 0 } @types;
1147        $total[$_] += $data[$_] for 0 .. $#data;
1148        push @data, $name;
1149        push @rows, \@data if $data[0];
1150    }
1151    @rows= sort {
1152        my $cmp= 0;
1153        for (0 .. $#$a - 1) {
1154            $cmp= $b->[$_] <=> $a->[$_];
1155            last if $cmp;
1156        }
1157        $cmp ||= $Collate->cmp($a->[-1], $b->[-1]);
1158        $cmp
1159    } @rows;
1160    @rows= reverse @rows if $self->{in_reverse};
1161
1162    if ($self->{as_cumulative}) {
1163        my $sum= [];
1164        for my $row (@rows) {
1165            do {
1166                $sum->[$_] += $row->[$_];
1167                $row->[$_]= $sum->[$_];
1168                }
1169                for 0 .. $#types;
1170        }
1171    }
1172
1173    if ($self->{as_percentage}) {
1174        for my $row (@rows) {
1175            $row->[$_]= sprintf "%.2f", ($row->[$_] / $total[$_]) * 100
1176                for 0 .. $#types;
1177        }
1178    }
1179
1180    foreach my $row (@rows) {
1181        my $name= $row->[-1];
1182        $name =~ s/\s+<.*\z//;
1183        $name =~ s/\s+\@.*\z//;
1184        $row->[-1]= $name;
1185    }
1186    my @col_names= map { $pretty_name{$_} // $_ } @types;
1187    if ($self->{as_percentage}) {
1188        $_= "%$_" for @col_names;
1189    }
1190    push @col_names, map { $pretty_name{$_} // $_ } @extra;
1191
1192    if ($self->{as_list} && @types == 1) {
1193        $self->_report_list(\@rows, \@types, \@extra, \@col_names);
1194    }
1195    else {
1196        $self->_report_table(\@rows, \@types, \@extra, \@col_names);
1197    }
1198}
1199
1200sub _report_table {
1201    my ($self, $rows, $types, $extra, $col_names)= @_;
1202    my $pos= 1;
1203    unshift @$_,         $pos++ for @$rows;
1204    unshift @$col_names, "Pos";
1205    my @width= (0) x @$col_names;
1206    foreach my $row ($col_names, @$rows) {
1207        for my $idx (0 .. $#$row) {
1208            $width[$idx] < length($row->[$idx])
1209                and $width[$idx]= length($row->[$idx]);
1210        }
1211    }
1212    $width[-1]= 40 if $width[-1] > 40;
1213    $width[$_]= -$width[$_] for 0, -1;
1214    my $fmt= "#" . join(" | ", ("%*s") x @$col_names) . "\n";
1215    my $bar_fmt= "#" . join("-+-", ("%*s") x @$col_names) . "\n";
1216    printf $fmt,     map { $width[$_], $col_names->[$_] } 0 .. $#width;
1217    printf $bar_fmt, map { $width[$_], "-" x abs($width[$_]) } 0 .. $#width;
1218    for my $idx (0 .. $#$rows) {
1219        my $row= $rows->[$idx];
1220        print encode_utf8 sprintf $fmt,
1221            map { $width[$_], $row->[$_] } 0 .. $#width;
1222    }
1223}
1224
1225sub _report_list {
1226    my ($self, $rows, $types, $extra, $col_names)= @_;
1227    my %hash;
1228    foreach my $row (@$rows) {
1229        $hash{ $row->[0] }{ $row->[-1] }++;
1230    }
1231    my @vals= sort { $b <=> $a } keys %hash;    # numeric sort
1232    my $width= length($col_names->[0]);
1233    $width < length($_) and $width= length($_) for @vals;
1234    @vals= reverse @vals if $self->{in_reverse};
1235
1236    my $hdr_str= sprintf "%*s | %s", $width, $col_names->[0], $col_names->[-1];
1237    my $sep_str= sprintf "%*s-+-%s", $width, "-" x $width, "-" x 40;
1238    my $fmt= "%*s | %s";
1239
1240    if ($self->{with_rank_numbers}) {
1241        $hdr_str= sprintf "#%*s | %s", -length(0 + @$rows), "Pos", $hdr_str;
1242        $sep_str= sprintf "#%*s-+-%s", -length(0 + @$rows),
1243            "-" x length(0 + @$rows), $hdr_str;
1244    }
1245    print $hdr_str, "\n";
1246    print $sep_str, "\n";
1247    my $pos= 1;
1248    foreach my $val (@vals) {
1249        my $val_f= sprintf "%*s | ", $width, $val;
1250        $val_f= sprintf "#%*d | %s", -length(0 + @$rows), $pos++, $val_f
1251            if $self->{with_rank_numbers};
1252        print encode_utf8 wrap $val_f,
1253            " " x length($val_f),
1254            join(", ", $Collate->sort(keys %{ $hash{$val} })) . "\n";
1255    }
1256}
1257
1258sub _filter_sort_who {
1259    my ($self, $hash)= @_;
1260    my @who;
1261    foreach my $name ($Collate->sort(keys %$hash)) {
1262        $name =~ s/\s+<.*\z//;
1263        $name =~ s/\s+\@.*\z//;
1264        push @who, $name if length $name and lc($name) ne "unknown";
1265    }
1266    return @who;
1267}
1268
1269sub print_who {
1270    my ($self)= @_;
1271    my @who= $self->_filter_sort_who($self->{who_stats});
1272    print encode_utf8 wrap "", "", join(", ", @who) . ".\n";
1273}
1274
12751;
1276__END__
1277
1278=head1 NAME
1279
1280Porting::updateAUTHORS - Library to automatically update AUTHORS and .mailmap based on commit data.
1281
1282=head1 SYNOPSIS
1283
1284    use Porting::updateAUTHORS;
1285
1286    my $updater= Porting::updateAUTHORS->new(
1287        authors_file => "AUTHORS",
1288        mailmap_file => ".mailmap",
1289        exclude_file => "Porting/exclude_contrib.txt",
1290    );
1291    $updater->read_and_update();
1292
1293=head1 DESCRIPTION
1294
1295This the brain of the F<Porting/updateAUTHORS.pl> script. It is expected
1296to be used B<from> that script and B<by> that script. Most features and
1297options are documented in the F<Porting/updateAUTHORS.pl> and are not
1298explicitly documented here, read the F<Porting/updateAUTHORS.pl> manpage
1299for more details.
1300
1301=head1 METHODS
1302
1303Porting::updateAUTHORS uses OO as way of managing its internal state.
1304This documents the public methods it exposes.
1305
1306=over 4
1307
1308=item add_new_mailmap_entries()
1309
1310If any additions were identified while reading the commits this will
1311inject them into the mailmap_hash so they can be written out. Returns a
1312count of additions found.
1313
1314=item check_fix_mailmap_hash()
1315
1316Analyzes the data contained the in the .mailmap file and applies any
1317automated fixes which are required and which it can automatically
1318perform. Returns a hash of adjusted entries and a hash with additional
1319metadata about the mailmap entries.
1320
1321=item new(%opts)
1322
1323Create a new object. Required parameters are
1324
1325    authors_file
1326    mailmap_file
1327    exclude_file
1328
1329Other supported parameters are as follows:
1330
1331    verbose
1332    commit_range
1333
1334this list is not exhaustive. See the code implementing the main()
1335function in F<Porting/updateAUTHORS.pl> for an exhaustive list.
1336
1337=item parse_orig_mailmap_hash()
1338
1339Takes a mailmap_hash and parses it and returns it as an array of array
1340records with the contents:
1341
1342    [ $preferred_name, $preferred_email,
1343      $other_name, $other_email,
1344      $line_num ]
1345
1346=item read_and_update()
1347
1348Wraps the other functions in this library and implements the logic and
1349intent of this tool. Takes two arguments, the authors file name, and the
1350mailmap file name. Returns nothing but may modify the AUTHORS file
1351or the .mailmap file. Requires that both files are editable.
1352
1353=item read_commit_log()
1354
1355Read the commit log specified by the property "commit_range" and find
1356any new names it contains.
1357
1358Normally used via C<read_and_update> and not called directly.
1359
1360=item read_authors_file()
1361
1362Read the AUTHORS file into the object, and return data about it.
1363
1364Normally used via C<read_and_update> and not called directly.
1365
1366=item read_mailmap_file()
1367
1368Read the .mailmap file into the object and return data about it.
1369
1370Normally used via C<read_and_update> and not called directly.
1371
1372=item read_exclusion_file()
1373
1374Read the exclusion file into the object and return data about it.
1375
1376Normally used via C<read_and_update> and not called directly.
1377
1378=item update_authors_file()
1379
1380Write out an updated AUTHORS file atomically if it has changed,
1381returns 0 if the file was actually updated, 1 if it was not.
1382
1383Normally used via C<read_and_update> and not called directly.
1384
1385=item update_mailmap_file()
1386
1387Write out an updated .mailmap file atomically if it has changed,
1388returns 0 if the file was actually updated, 1 if it was not.
1389
1390Normally used via C<read_and_update> and not called directly.
1391
1392=item update_exclusion_file()
1393
1394Write out an updated exclusion file atomically if it has changed,
1395returns 0 if the file was actually update, 1 if it was not.
1396
1397Normally used via C<read_and_update> and not called directly.
1398
1399=back
1400
1401=head1 TODO
1402
1403More documentation and testing.
1404
1405=head1 SEE ALSO
1406
1407F<Porting/checkAUTHORS.pl>
1408
1409=head1 AUTHOR
1410
1411Yves Orton <demerphq@gmail.com>
1412
1413=cut
1414