GHA: spellcheck curl symbols better

This now makes sure to trim off exact matches for curl symbols and long
curl commanad line options instead of using pattern matching as before.
This should catch typoed names (that still follow the pattern) better.

The cleanspell.pl script is no longer used. cleancmd.pl is used for all
markdown files.

Closes #16504
This commit is contained in:
Daniel Stenberg 2025-02-27 11:17:42 +01:00
parent a8ad9a5758
commit 7007f59caa
No known key found for this signature in database
GPG Key ID: 5CC908FDB71E12C2
4 changed files with 255 additions and 140 deletions

View File

@ -3,55 +3,117 @@
#
# SPDX-License-Identifier: curl
#
# Input: a cmdline docs markdown, it gets modified *in place*
# Input: cmdline docs markdown files, they get modified *in place*
#
# Strip off the leading meta-data/header part, remove all known curl symbols
# and long command line options. Also clean up whatever else the spell checker
# might have a problem with that we still deem is fine.
#
# The main purpose is to strip off the leading meta-data part, but also to
# clean up whatever else the spell checker might have a problem with that we
# still deem is fine.
my $header = 1;
while(1) {
# set this if the markdown has no meta-data header to skip
if($ARGV[0] eq "--no-header") {
shift @ARGV;
$header = 0;
}
else {
last;
open(S, "<./docs/libcurl/symbols-in-versions")
|| die "can't find symbols-in-versions";
while(<S>) {
if(/^([^ ]*) /) {
push @asyms, $1;
}
}
close(S);
my $f = $ARGV[0];
# init the opts table with "special" options not easy to figure out
my @aopts = (
'--ftp-ssl-reqd', # old alias
);
open(F, "<$f") or die;
my $ignore = $header;
my $sepcount = 0;
my @out;
while(<F>) {
if(/^---/ && $header) {
if(++$sepcount == 2) {
$ignore = 0;
open(O, "<./docs/options-in-versions")
|| die "can't find options-in-versions";
while(<O>) {
chomp;
if(/^([^ ]+)/) {
my $o = $1;
push @aopts, $o;
if($o =~ /^--no-(.*)/) {
# for the --no options, also make one without it
push @aopts, "--$1";
}
elsif($o =~ /^--disable-(.*)/) {
# for the --disable options, also make the special ones
push @aopts, "--$1";
push @aopts, "--no-$1";
}
}
}
close(O);
open(C, "<./.github/scripts/spellcheck.curl")
|| die "can't find spellcheck.curl";
while(<C>) {
if(/^\#/) {
next;
}
next if($ignore);
# strip out backticked words
$_ =~ s/`[^`]+`//g;
# strip out all long command line options
$_ =~ s/--[a-z0-9-]+//g;
# strip out https URLs, we don't want them spellchecked
$_ =~ s!https://[a-z0-9\#_/.-]+!!gi;
push @out, $_;
chomp;
if(/^([^ ]+)/) {
push @asyms, $1;
}
}
close(F);
close(C);
if(!$ignore) {
open(O, ">$f") or die;
print O @out;
close(O);
# longest symbols first
my @syms = sort { length($b) <=> length($a) } @asyms;
# longest cmdline options first
my @opts = sort { length($b) <=> length($a) } @aopts;
sub process {
my ($f) = @_;
my $ignore = 0;
my $sepcount = 0;
my $out;
my $line = 0;
open(F, "<$f") or die;
while(<F>) {
$line++;
if(/^---/ && ($line == 1)) {
$ignore = 1;
next;
}
elsif(/^---/ && $ignore) {
$ignore = 0;
next;
}
next if($ignore);
my $l = $_;
# strip out backticked words
$l =~ s/`[^`]+`//g;
# **bold**
$l =~ s/\*\*(\S.*?)\*\*//g;
# *italics*
$l =~ s/\*(\S.*?)\*//g;
# strip out https URLs, we don't want them spellchecked
$l =~ s!https://[a-z0-9\#_/.-]+!!gi;
$out .= $l;
}
close(F);
# cut out all known curl cmdline options
map { $out =~ s/$_//g; } (@opts);
# cut out all known curl symbols
map { $out =~ s/\b$_\b//g; } (@syms);
if(!$ignore) {
open(O, ">$f") or die;
print O $out;
close(O);
}
}
for my $f (@ARGV) {
process($f);
}

View File

@ -1,86 +0,0 @@
#!/usr/bin/env perl
# Copyright (C) Daniel Stenberg, <daniel@haxx.se>, et al.
#
# SPDX-License-Identifier: curl
#
# Given: a libcurl curldown man page
# Outputs: the same file, minus the SYNOPSIS and the EXAMPLE sections
#
my $f = $ARGV[0];
open(F, "<$f") or die;
my @out;
my $ignore = 0;
while(<F>) {
if($_ =~ /^# (SYNOPSIS|EXAMPLE)/) {
$ignore = 1;
}
elsif($ignore && ($_ =~ /^# [A-Z]/)) {
$ignore = 0;
}
elsif(!$ignore) {
# **bold**
$_ =~ s/\*\*(\S.*?)\*\*//g;
# *italics*
$_ =~ s/\*(\S.*?)\*//g;
$_ =~ s/CURL(M|SH|U|H)code//g;
$_ =~ s/CURL_[A-Z0-9_]*//g;
$_ =~ s/CURLALTSVC_[A-Z0-9_]*//g;
$_ =~ s/CURLAUTH_[A-Z0-9_]*//g;
$_ =~ s/CURLE_[A-Z0-9_]*//g;
$_ =~ s/CURLFORM_[A-Z0-9_]*//g;
$_ =~ s/CURLFTP_[A-Z0-9_]*//g;
$_ =~ s/CURLFTPAUTH_[A-Z0-9_]*//g;
$_ =~ s/CURLFTPMETHOD_[A-Z0-9_]*//g;
$_ =~ s/CURLFTPSSL_[A-Z0-9_]*//g;
$_ =~ s/CURLGSSAPI_[A-Z0-9_]*//g;
$_ =~ s/CURLHEADER_[A-Z0-9_]*//g;
$_ =~ s/CURLINFO_[A-Z0-9_]*//g;
$_ =~ s/CURLM_[A-Z0-9_]*//g;
$_ =~ s/CURLMIMEOPT_[A-Z0-9_]*//g;
$_ =~ s/CURLMOPT_[A-Z0-9_]*//g;
$_ =~ s/CURLOPT_[A-Z0-9_]*//g;
$_ =~ s/CURLPIPE_[A-Z0-9_]*//g;
$_ =~ s/CURLPROTO_[A-Z0-9_]*//g;
$_ =~ s/CURLPROXY_[A-Z0-9_]*//g;
$_ =~ s/CURLPX_[A-Z0-9_]*//g;
$_ =~ s/CURLSHE_[A-Z0-9_]*//g;
$_ =~ s/CURLSHOPT_[A-Z0-9_]*//g;
$_ =~ s/CURLSSLOPT_[A-Z0-9_]*//g;
$_ =~ s/CURLSSH_[A-Z0-9_]*//g;
$_ =~ s/CURLSSLBACKEND_[A-Z0-9_]*//g;
$_ =~ s/CURLU_[A-Z0-9_]*//g;
$_ =~ s/CURLUPART_[A-Z0-9_]*//g;
#$_ =~ s/\bCURLU\b//g; # stand-alone CURLU
$_ =~ s/CURLUE_[A-Z0-9_]*//g;
$_ =~ s/CURLHE_[A-Z0-9_]*//g;
$_ =~ s/CURLWS_[A-Z0-9_]*//g;
$_ =~ s/CURLKH[A-Z0-9_]*//g;
$_ =~ s/CURLUPART_[A-Z0-9_]*//g;
$_ =~ s/CURLUSESSL_[A-Z0-9_]*//g;
$_ =~ s/CURLPAUSE_[A-Z0-9_]*//g;
$_ =~ s/CURLHSTS_[A-Z0-9_]*//g;
$_ =~ s/curl_global_([a-z_]*)//g;
$_ =~ s/curl_(strequal|strnequal|formadd|waitfd|formget|getdate|formfree)//g;
$_ =~ s/curl_easy_([a-z]*)//g;
$_ =~ s/curl_multi_([a-z_]*)//g;
$_ =~ s/curl_mime_(subparts|addpart|filedata|data_cb)//g;
$_ =~ s/curl_ws_(send|recv|meta)//g;
$_ =~ s/curl_url_(dup)//g;
$_ =~ s/curl_pushheader_by(name|num)//g;
$_ =~ s/libcurl-(env|ws)//g;
$_ =~ s/libcurl\\-(env|ws)//g;
$_ =~ s/(^|\W)((tftp|https|http|ftp):\/\/[a-z0-9\-._~%:\/?\#\[\]\@!\$&'()*+,;=\\]+)//gi;
push @out, $_;
}
}
close(F);
open(O, ">$f") or die;
for my $l (@out) {
print O $l;
}
close(O);

151
.github/scripts/spellcheck.curl vendored Normal file
View File

@ -0,0 +1,151 @@
# Copyright (C) Daniel Stenberg, <daniel@haxx.se>, et al.
#
# SPDX-License-Identifier: curl
#
# common variable types + structs
# callback typedefs
# public functions names
# some man page names
curl_fileinfo
curl_forms
curl_hstsentry
curl_httppost
curl_index
curl_khkey
curl_pushheaders
curl_waitfd
CURLcode
CURLformoption
CURLHcode
CURLMcode
CURLMsg
CURLSHcode
CURLUcode
curl_calloc_callback
curl_chunk_bgn_callback
curl_chunk_end_callback
curl_conv_callback
curl_debug_callback
curl_fnmatch_callback
curl_formget_callback
curl_free_callback
curl_hstsread_callback
curl_hstswrite_callback
curl_ioctl_callback
curl_malloc_callback
curl_multi_timer_callback
curl_opensocket_callback
curl_prereq_callback
curl_progress_callback
curl_push_callback
curl_read_callback
curl_realloc_callback
curl_resolver_start_callback
curl_seek_callback
curl_socket_callback
curl_sockopt_callback
curl_ssl_ctx_callback
curl_strdup_callback
curl_trailer_callback
curl_write_callback
curl_xferinfo_callback
curl_strequal
curl_strnequal
curl_mime_init
curl_mime_free
curl_mime_addpart
curl_mime_name
curl_mime_filename
curl_mime_type
curl_mime_encoder
curl_mime_data
curl_mime_filedata
curl_mime_data_cb
curl_mime_subparts
curl_mime_headers
curl_formadd
curl_formget
curl_formfree
curl_getdate
curl_getenv
curl_version
curl_easy_escape
curl_escape
curl_easy_unescape
curl_unescape
curl_free
curl_global_init
curl_global_init_mem
curl_global_cleanup
curl_global_trace
curl_global_sslset
curl_slist_append
curl_slist_free_all
curl_getdate
curl_share_init
curl_share_setopt
curl_share_cleanup
curl_version_info
curl_easy_strerror
curl_share_strerror
curl_easy_pause
curl_easy_ssls_import
curl_easy_ssls_export
curl_easy_init
curl_easy_setopt
curl_easy_perform
curl_easy_cleanup
curl_easy_getinfo
curl_easy_duphandle
curl_easy_reset
curl_easy_recv
curl_easy_send
curl_easy_upkeep
curl_easy_header
curl_easy_nextheader
curl_mprintf
curl_mfprintf
curl_msprintf
curl_msnprintf
curl_mvprintf
curl_mvfprintf
curl_mvsprintf
curl_mvsnprintf
curl_maprintf
curl_mvaprintf
curl_multi_init
curl_multi_add_handle
curl_multi_remove_handle
curl_multi_fdset
curl_multi_waitfds
curl_multi_wait
curl_multi_poll
curl_multi_wakeup
curl_multi_perform
curl_multi_cleanup
curl_multi_info_read
curl_multi_strerror
curl_multi_socket
curl_multi_socket_action
curl_multi_socket_all
curl_multi_timeout
curl_multi_setopt
curl_multi_assign
curl_multi_get_handles
curl_pushheader_bynum
curl_pushheader_byname
curl_multi_waitfds
curl_easy_option_by_name
curl_easy_option_by_id
curl_easy_option_next
curl_url
curl_url_cleanup
curl_url_dup
curl_url_get
curl_url_set
curl_url_strerror
curl_ws_recv
curl_ws_send
curl_ws_meta
libcurl-env
libcurl-ws

View File

@ -107,20 +107,8 @@ jobs:
persist-credentials: false
name: checkout
- name: trim all man page *.md files
run: find docs -name "*.md" ! -name "_*" -print0 | xargs -0 -n1 .github/scripts/cleancmd.pl
- name: trim libcurl man page *.md files
run: find docs/libcurl \( -name "curl_*.md" -o -name "libcurl*.md" \) -print0 | xargs -0 -n1 .github/scripts/cleanspell.pl
- name: trim libcurl option man page *.md files
run: find docs/libcurl/opts -name "CURL*.md" -print0 | xargs -0 -n1 .github/scripts/cleanspell.pl
- name: trim cmdline docs markdown _*.md files
run: find docs/cmdline-opts -name "_*.md" -print0 | xargs -0 -n1 .github/scripts/cleancmd.pl --no-header
- name: trim docs/ markdown _*.md files
run: git ls-files docs/*.md docs/internals/*.md | xargs -n1 .github/scripts/cleancmd.pl --no-header
- name: trim all *.md files in docs/
run: .github/scripts/cleancmd.pl $(find docs -name "*.md")
- name: setup the custom wordlist
run: grep -v '^#' .github/scripts/spellcheck.words > wordlist.txt