diff options
| -rw-r--r-- | _data/scripts.yaml | 9 | ||||
| -rw-r--r-- | scripts/urlinfo.pl | 397 |
2 files changed, 406 insertions, 0 deletions
diff --git a/_data/scripts.yaml b/_data/scripts.yaml index ec8af2a..c173eca 100644 --- a/_data/scripts.yaml +++ b/_data/scripts.yaml @@ -4220,3 +4220,12 @@ license: "WTFPL" name: "unicode" version: "1" + +- authors: "David Leadbeater" + contact: "dgl@dgl.cx" + description: "Print short summaries about URLs from known services that are mentioned on IRC. (Including YouTube, etc.)" + filename: "urlinfo.pl" + modified: "2014-08-10" + license: "WTFPL" + name: "urlinfo" + version: "1.1" diff --git a/scripts/urlinfo.pl b/scripts/urlinfo.pl new file mode 100644 index 0000000..8715ebb --- /dev/null +++ b/scripts/urlinfo.pl @@ -0,0 +1,397 @@ +use 5.014; +use utf8; +use Irssi; +use POSIX (); + +our $VERSION = "1.1"; +our %IRSSI = ( + authors => 'David Leadbeater', + contact => 'dgl@dgl.cx', + name => 'urlinfo', + description => 'Print short summaries about URLs from known services that are mentioned on IRC. (Including YouTube, etc.)', + license => 'WTFPL <http://dgl.cx/licence>', + url => 'http://dgl.cx/irssi', +); + +# This is needed so it can still run standalone for testing. +BEGIN { + Irssi->import(20140628) if __PACKAGE__ =~ /Irssi/; # This needs Irssi 0.8.17 +} + +BEGIN { + eval { + require HTML::TreeBuilder; + require URI; + } or do { + print "\x{3}8You need to install HTML::TreeBuilder and URI"; + if (-f "/etc/debian_version") { + print "Try running: \x{3}9sudo apt-get install libhtml-treebuilder-perl libwww-perl"; + } + die $@; + } +} + +# ------- Settings +# /SET urlinfo_title_unknown ON|OFF +# Show title of all unknown sites. (There is a timeout to prevent against +# obvious resource exhaustion attacks, but remember this script has no +# warranty.) +# +# /SET urlinfo_timeout 10 +# How many seconds after which to give up trying to fetch a URL +# +# /SET urlinfo_ignore_domains example\.org example\.com +# Space separated list of regular expressions of domains to ignore +# +# /SET urlinfo_custom_domains my\.domain/thing irssi\.org=description +# A limited way of configuring custom domains, if you need something more +# complex edit SITES below. +# Format: domain[/path[=from]] + +# ------- Sites configuration + +# This script aims to be data driven, this hash has "site_name => {site details}" +# site details is a hash reference which can contain: +# cleanup: A regexp of text to remove from the resulting string +# domain: A string (or regexp, with qr//) of the domain to match (www. is +# removed automatically). +# from: Where to read the info "title", "description" (meta description) or +# a regexp to match the content (default "title") +# example: Example of this URL +# expected: What the example should return (see end for testing) +# items: An array ref of additional hashes to allow multiple of these values +# e.g.: items => [ { domain => "example.com" } ] +# path: Path component (string or regexp) +# +my %SITES = ( + vimeo => { + cleanup => qr/\s*on Vimeo$/, + domain => "vimeo.com", + path => qr{/\d+}, + example => "http://vimeo.com/80871338", + expected => "Journey Part 1", + }, + youtube => { + cleanup => qr/\s*-\s*YouTube$/, + items => [ + { + domain => "youtu.be", + example => "http://youtu.be/wa1c6EU2bY0", + expected => "I Am The Resurrection (Remastered)", + }, + { + domain => "youtube.com", + path => "/watch", + example => "https://www.youtube.com/watch?v=q99JgYrgzco&list=PLE57B71744156439A", + expected => "I Wanna Be Adored (Remastered)", + }, + ], + }, + metacpan => { + cleanup => qr/\s*-\s*metacpan\.org$/, + domain => "metacpan.org", + path => qr{^/pod/}, + example => "https://metacpan.org/pod/release/DOY/Reply-0.34/lib/Reply.pm", + from => "description", + expected => "read, eval, print, loop, yay!", + }, + pypi => { + domain => "pypi.python.org", + path => qr{^/pypi/}, + from => "description", + example => "https://pypi.python.org/pypi/stanford-corenlp-python/3.3.6-0", + expected => "A Stanford Core NLP wrapper (wordseer fork)", + }, + gist => { + cleanup => qr/\s*-\s*Gist is .*$/, + domain => "gist.github.com", + from => ["og:title", "description"], + example => "https://gist.github.com/dgl/792206", + expected => "dgl/installblead: An install script that installs a development version of perl (from git) and keeps a particular set of modules installed. Sort of perlbrew for blead, but not quite.", + }, + github => { + domain => "github.com", + items => [ + { # issue, commit or pull + cleanup => qr/\s*·.*$/, + path => qr{^/[^/]+/[^/]+/(?:issues|commit|pull)/[a-f0-9]}, + example => "https://github.com/irssi/irssi/commit/669add", + expected => "FS#155 hilight -tag", + }, + { # user or project + from => "og:description", + path => qr{^/[^/]+(?:/[^/]+)?$}, + example => "https://github.com/irssi/irssi", + expected => "irssi - The client of the future", + }, + ], + }, +); + +# ------- Site handling + +sub expand { + my @expanded_sites; + for my $site(keys %SITES) { + expand_site(\@expanded_sites, $site, $SITES{$site}, {}); + } + return @expanded_sites; +} + +# This essentially implements inheritance (via the "items" key), to reduce +# duplication. +sub expand_site { + my($expanded_sites, $site, $site_data, $current) = @_; + delete $current->{items}; + my $s = { + name => $site, + from => "title", + %$current, + %$site_data, + }; + if (exists $s->{items}) { + expand_site($expanded_sites, $site, $_, $s) for @{$s->{items}}; + } else { + push @$expanded_sites, $s; + } +} + +sub _matcher { + my($site, $item) = @_; + return 1 unless defined $site; + return 1 if ref $site eq 'ARRAY' && grep _matcher($_, $item), @$site; + return 1 if ref $site && $site->isa("Regexp") && $item =~ $site; + return $site eq $item; +} + +sub get_site { + my($sites, $url) = @_; + + my $uri = URI->new($url); + $uri = URI->new("http://$url") unless $uri and $uri->scheme; + return unless $uri and $uri->can("host") and $uri->host and $uri->scheme =~ /^https?$/; + + for my $site(@$sites) { + my $match = 1; + $match &&= _matcher($site->{domain}, $uri->host =~ s/^www\.//ri); + $match &&= _matcher($site->{$_}, $uri->$_) for qw(scheme host path query fragment); + return $site, $uri if $match; + } + + if (Irssi::settings_get_bool("urlinfo_title_unknown")) { + return { name => "unknown", from => "title" }, $uri; + } + + return; +} + +my %from = ( + title => sub { + $_[0]->look_down(_tag => 'title')->as_trimmed_text; + }, + description => sub { + my $el = $_[0]->look_down(_tag => 'meta', name => 'description'); + $el && $el->attr('content'); + }, + 'og:description' => sub { + my $el = $_[0]->look_down(_tag => 'meta', property => 'og:description'); + $el && $el->attr('content'); + }, + 'og:title' => sub { + my $el = $_[0]->look_down(_tag => 'meta', property => 'og:title'); + $el && $el->attr('content'); + }, +); + +sub get_info { + my($site, $uri) = @_; + my $from = $site->{from}; + my $tree = HTML::TreeBuilder->new_from_url($uri); + my $info; + if (!ref $from || ref $from eq 'ARRAY') { + $info = join ": ", grep defined, map $from{$_}->($tree), ref $from ? @$from : $from; + } else { + $info = join "", $tree->as_html =~ $from; + } + $info =~ s/$site->{cleanup}// if $site->{cleanup}; + $info =~ s/([\x00-\x19])/sprintf "\\x%x", ord $1/ger; +} + +# ------- IRC message handling + +# John Gruber's URL regexp (nicely handles people putting URLs in parens, etc) +my $URL_RE = qr{((?:[a-z][\w-]+:(?:/{1,3}|[a-z0-9%])|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'".,<>?«»“”‘’]))}; + +my $pipe_in_progress; +my @sites; +my $timeout = 10; + +sub msg { + my($server, $text, $nick, undef, $target) = @_; + # TODO: Add a queue / multiple pipe support? + return if $pipe_in_progress; + + my $msg_time = time; + my $tag = $server->{tag}; + my $target = $target || $nick; + + if (my($url) = $text =~ $URL_RE) { + my($site, $uri) = get_site(\@sites, $url); + return unless $site; + return if ignored($uri); + + fork_wrapper(sub { # Child + my($fh) = @_; + syswrite $fh, " " . get_info($site, $uri); + }, + sub { # Parent + my($in) = @_; + if ($in =~ s/^- //) { + print "\x{3}4urlinfo error:\x{3} $in"; + return; + } + $in =~ s/^ //; + return unless $in; + + # Avoid reusing server just in case it is no longer valid + my $server = Irssi::server_find_tag($tag); + my $win = find_window($server, $target); + + my $view = $win->view; + my $line = $view->get_lines; + while ($line && ($line = $line->next)) { + if ($line->{info}->{time} >= $msg_time) { + if ($line->get_text(0) =~ /\Q$url/) { + last; + } + } + } + + my $timestamp = POSIX::strftime( + Irssi::settings_get_str("timestamp_format"), localtime $msg_time); + # I'm sure I shouldn't have to care about colours here... + my $pad = length Irssi::strip_codes($timestamp); + + my $text = $win->format_get_text(__PACKAGE__, $server, $target, + "urlinfo", " " x $pad, $in); + $win->print_after($line, MSGLEVEL_NO_ACT|MSGLEVEL_CLIENTCRAP, + $text, $msg_time); + $view->redraw; + }); + } +} + +sub ignored { + my($uri) = @_; + my @ignored = split / /, Irssi::settings_get_str('urlinfo_ignore_domains'); + my $domain = $uri->host =~ s/^www\.//r; + return grep $domain =~ /^$_$/, @ignored; +} + +sub find_window { + my($server, $target) = @_; + if (my $witem = $server->window_item_find($target)) { + return $witem->window; + } else { + # Maybe they have a msgs window? + my $win = Irssi::window_find_name("(msgs)"); + # Ultimate fallback + $win = Irssi::window_find_refnum(1) unless $win; + return $win; + } +} + +# Based on scriptassist. +sub fork_wrapper { + my($child, $parent) = @_; + + pipe(my $rfh, my $wfh); + + my $pid = fork; + $pipe_in_progress = 1; + + return unless defined $pid; + + if ($pid) { + close $wfh; + Irssi::pidwait_add($pid); + my $pipetag; + my @args = ($rfh, \$pipetag, $parent); + $pipetag = Irssi::input_add(fileno($rfh), Irssi::INPUT_READ, \&pipe_input, \@args); + } else { + eval { + local $SIG{ALRM} = sub { die "Timed out\n" }; + alarm $timeout; + $child->($wfh); + }; + alarm 0; + syswrite $wfh, "- $@" if $@; + POSIX::_exit(1); + } +} + +sub pipe_input { + my ($rfh, $pipetag, $parent) = @{$_[0]}; + my $line = <$rfh>; + close($rfh); + Irssi::input_remove($$pipetag); + $pipe_in_progress = 0; + $parent->($line); +} + +sub setup_changed { + $timeout = Irssi::settings_get_int("urlinfo_timeout"); + + @sites = expand(); + for my $site (split / /, Irssi::settings_get_str("urlinfo_custom_domains")) { + next unless $site; + + my($re, $from) = split /=/, $site; + $from ||= "title"; + my($domain, $path) = split m{/}, $re, 2; + expand_site(\@sites, "custom", { + domain => qr/^$domain$/, + path => defined $path ? qr/^\/$path/ : undef, + from => $from, + }, {}); + } +} + +# ------- Initialization + +if (caller) { + # Irssi specific initialization + require Irssi::TextUI; + + Irssi::settings_add_str($IRSSI{name}, "urlinfo_custom_domains", ""); + Irssi::settings_add_str($IRSSI{name}, "urlinfo_ignore_domains", ""); + Irssi::settings_add_int($IRSSI{name}, "urlinfo_timeout", $timeout); + Irssi::settings_add_bool($IRSSI{name}, "urlinfo_title_unknown", 0); + + Irssi::signal_add("message irc action" => \&msg); + Irssi::signal_add("message private" => \&msg); + Irssi::signal_add("message public" => \&msg); + + Irssi::signal_add_last("setup changed", \&setup_changed); + setup_changed(); + + Irssi::theme_register([ + 'urlinfo' => '$0 %Kinfo:%n $1', + ]); + +} else { + # Built in test. Run this script outside Irssi to use. + @sites = expand(); + for my $site(@sites) { + next unless $site->{example}; + my($found_site, $uri) = get_site(\@sites, $site->{example}); + if ($found_site != $site) { + die "Got $found_site->{name}, expected $site->{name}"; + } + say "Get $uri"; + my $result = get_info($site, $uri); + say $result; + die "Got $result, expected $site->{expected}" unless $result eq $site->{expected}; + } + say "OK"; +} |
