[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

Re: [TV] Formatting listings like the radiotimes website



Andrew Flegg wrote:
On Mon, Mar 07, 2005 at 05:43:14AM +0000, Richard Lewis wrote:
Do you plan to add the other ITV regions? (also things like BBC 2
Wales, and there were some other digital terrestrial channels not
available)


Different regions are planned, but I'm only really motivated if their
listings dramatically differ; other channel requests are welcome, but
will depend on them being on a network which already has a parser or
having a decent website with the wanted information on it.

The attached Perl script will do that for the ITV channels. It pulls the TV listings off the Guardian newspaper website and creates an XML file in the same format as the current bleb.org XML output.
#!/usr/bin/perl

use strict;

use English;
use LWP::Simple qw/$ua get/;
# Identification sent along with every HTTP request made by this script.
my $useragent = 'BlipVert ITV Component 0.1';
my $from = 'steve@xxxxxxxxxxxxxx';

# Apply that identification to the $ua object imported from LWP::Simple.
$ua->agent($useragent);
$ua->from($from);

# Layout of a Guardian TV listings page, as relied on further down.
#
# The channel title is found inside the header section:
#   <!-- Header -->
#   <FONT FACE="Arial,Helvetica,sans-serif" SIZE="5"><B>
#   <!-- header in size 5; arial,helvetica,sans-serif -->
#   <!-- INSERT CHANNEL NAME -->
#   Tyne Tees
#   </B></FONT><HR SIZE=1 NOSHADE>
#   <!-- End header -->
#
# Every programme is enclosed by a pair of marker comments:
#   <!-- start of TV style -->
#   <!-- end of TV style -->

#########################################################################
sub convert_time {
    # Convert a 12-hour clock string such as "10.30am" or "7.05PM" into a
    # four digit 24-hour "HHMM" string ("1030", "1905").
    #
    # Argument: the time string.  Hours and minutes may be separated by
    #           "." or ":", the minutes are optional ("9pm"), the am/pm
    #           suffix is optional and case-insensitive, and surrounding
    #           whitespace is ignored.
    # Returns:  the "HHMM" string, or the empty string when the argument
    #           is undefined or does not look like a time.
    my $timestr = shift();
    return '' unless defined $timestr;

    return ''
      unless $timestr =~ /^\s*(\d{1,2})(?:[.:](\d{1,2}))?\s*([ap]m)?\s*$/i;
    my ($hour, $min, $meridiem) = ($1, $2, $3);

    $min = 0 unless defined $min;
    $meridiem = defined $meridiem ? lc $meridiem : '';

    # 12am is midnight (00) and 12pm is noon (12); only 1pm-11pm move
    # forward by twelve hours.
    if ($meridiem eq 'pm' and $hour < 12) {
        $hour += 12;
    }
    elsif ($meridiem eq 'am' and $hour == 12) {
        $hour = 0;
    }

    # A literal hour of 24 is still folded back to midnight.
    $hour = 0 if $hour == 24;

    return sprintf('%02d%02d', $hour, $min);
}
#########################################################################
sub process_channel {
    # Fetch one Guardian TV listings page and write its programmes out as
    # an XML channel file.
    #
    # Named parameters:
    #   url    - address of the listings page to download
    #   name   - channel name, written to the id attribute of <channel>
    #   xmlout - path of the XML file to create (overwritten if it exists)
    #
    # Returns 1 on success.  If the page cannot be downloaded or the output
    # file cannot be written, a warning is issued and a false value is
    # returned; a failed download leaves any existing output file untouched.
    my %parms = @_;

    my $content = get($parms{url});
    unless (defined $content) {
        warn "process_channel: could not fetch $parms{url}\n";
        return;
    }

    # Cut the page after every " of TV style -->" marker.  Both the start
    # and the end marker comments finish with that text, so after the
    # leading page header is dropped the programmes sit at the even
    # positions and the filler between them at the odd ones.  Splitting in
    # memory yields the same records as reading the page back with that
    # text as the input record separator, without a temporary file and
    # without changing the global separator.
    my @programs = split /(?<= of TV style -->)/, $content;
    shift @programs;

    # drop some of the html - leaves bold around time and title
    # so the fields can be split on that later
    foreach my $program (@programs) {
        $program =~ s/<TD.*|<TR>.*|<FONT.*//ig;
        $program =~ s/<!--.*//g;
        $program =~ s/<\/TD>|<\/TR>//ig;
        $program =~ s/<\/FONT>//ig;
        $program =~ s/\s{2,}//g;
        # sgml character entities confuse xml parsers: remove each entity
        # on its own (not the rest of the line with it), then any stray "&"
        $program =~ s/&#?\w+;\s?//g;
        $program =~ s/&//g;
    }

    # date for xml header, as day/month/year without zero padding
    my ($day, $mon, $year) = (localtime)[3, 4, 5];
    my $date = join '/', $day, $mon + 1, $year + 1900;

    my $out;
    unless (open $out, '>', $parms{xmlout}) {
        warn "process_channel: cannot write $parms{xmlout}: $!\n";
        return;
    }

    # output XML header
    print {$out} '<?xml version="1.0" encoding="UTF-8"?>', "\n";
    print {$out} '<channel id="', $parms{name}, '" source="The Guardian" ',
      'date="', $date, '">';

    # dump show titles and timings
    for my $i (0 .. $#programs) {
        my $program = $programs[$i];

        # The following programme is two records on (one filler record in
        # between); its start time is used as this programme's end time.
        my $nextprog = $i + 2 <= $#programs ? $programs[$i + 2] : '';

        $program  =~ s/<B>//ig;
        $nextprog =~ s/<B>//ig;

        my ($start, $title, $desc) = split(/<\/B>/i, $program);
        my ($nextstart) = split(/<\/B>/i, $nextprog);

        # Trim every field individually.
        for my $field ($start, $title, $desc, $nextstart) {
            next unless defined $field;
            $field =~ s/^\s+|\s+$//g;
        }

        # Filler records and incomplete entries lack one of the fields.
        next if grep { !defined($_) || !length($_) } $start, $title, $desc;

        $start = convert_time($start);
        $nextstart = convert_time(defined $nextstart ? $nextstart : '');

        print {$out} "\n<programme>\n",
          "  <desc>$desc</desc>\n",
          "  <title>$title</title>\n",
          "  <end>$nextstart</end>\n",
          "  <infourl>Sorry, information not provided by source The Guardian</infourl>\n",
          "  <start>$start</start>\n",
          "</programme>\n";
    }

    print {$out} "</channel>\n";

    # Buffered write errors only surface when the handle is closed.
    unless (close $out) {
        warn "process_channel: error closing $parms{xmlout}: $!\n";
        return;
    }

    return 1;
}
#########################################################################

# Channels to convert on each run; every entry holds the named arguments
# for one process_channel() call.
my @channels = (
    {
        url    => 'http://www.guardian.co.uk/TV/tyne_tyne.html',
        name   => 'ITV Tyne Tees',
        xmlout => '/home/steve/public_html/TV/tyne_tees.xml',
    },
);

process_channel(%$_) for @channels;

# Anglia
# http://www.guardian.co.uk/TV/anglia_anglia.html

# Border
# http://www.guardian.co.uk/TV/border_border.html

# Carlton
# process_channel(url =>  'http://www.guardian.co.uk/TV/carlton_carlton.html',
# 		  name => 'ITV Carlton',
#		  xmlout => '/home/steve/public_html/TV/carlton.xml');

# Central
# http://www.guardian.co.uk/TV/central_central.html

# Channel
# http://www.guardian.co.uk/TV/channel_channel.html

# Grampian
# http://www.guardian.co.uk/TV/grampian_grampian.html

# Granada
# http://www.guardian.co.uk/TV/granada_granada.html

# HTV Wales
# http://www.guardian.co.uk/TV/htvwales_htv_wales.html

# HTV West
# http://www.guardian.co.uk/TV/htvwest_htv_west.html

# Meridian
# http://www.guardian.co.uk/TV/meridian_meridian.html

# Tyne Tees
# http://www.guardian.co.uk/TV/tyne_tyne.html

# Scottish
# http://www.guardian.co.uk/TV/scottish_scottish.html

# Ulster
# http://www.guardian.co.uk/TV/ulster_ulster.html

# West Country
# http://www.guardian.co.uk/TV/westcountry_westcountry.html

# Yorkshire
# http://www.guardian.co.uk/TV/ytv_ytv.html

# South East
# http://www.guardian.co.uk/TV/meridian_southeast.html

# London
# http://www.guardian.co.uk/TV/carlton_london.html

# Scotland
# http://www.guardian.co.uk/TV/bordern_border_north.html