How I Analyzed the PASS 2010 Summit Submissions

Want to know how I analyzed the Summit session data? I exported the data from the abstract selection system into a tab delimited file. Since I use a Mac at home, I used the tools available to me: PostgreSQL.

I loaded the data using the PostgreSQL COPY command to bulk load the report output and then did some data mojo using PostgreSQL. Most of it was exactly the same as it would be for SQL Server with a few differences. Here’s the code that I used:

SELECT  COUNT(*) AS total_sessions
FROM    abstracts;
GO
-- Yup, I used the GO statement separator. This is a byproduct
-- of the client that I use, not because of PostgreSQL.

SELECT  COUNT(DISTINCT first_name || ' ' || last_name) AS name_count
FROM    abstracts ;
GO

SELECT  job_title, 
        COUNT(job_title) AS jt_count
FROM    ( SELECT  job_title, 
                  first_name, 
                  last_name
          FROM    abstracts
          GROUP BY job_title, 
                  first_name, 
                  last_name
) AS x
GROUP BY job_title
ORDER BY jt_count DESC;
GO

WITH c AS (
  SELECT  job_title, 
          first_name, 
          last_name
  FROM    abstracts
  GROUP BY job_title, 
          first_name, 
          last_name
)
SELECT  SUM(CASE WHEN LOWER(job_title) LIKE '%engineer%' THEN 1 ELSE 0 END) AS engineer_count,
        SUM(CASE WHEN LOWER(job_title) LIKE '%manager%' THEN 1 ELSE 0 END) AS manager_count,
        SUM(CASE WHEN LOWER(job_title) LIKE '%developer%' THEN 1 ELSE 0 END) AS developer_count,
        SUM(CASE WHEN LOWER(job_title) LIKE '%consultant%' THEN 1 ELSE 0 END) AS consultant_count,
        SUM(CASE WHEN LOWER(job_title) LIKE '%business intelligence%' THEN 1 
                 WHEN LOWER(job_title) LIKE '%b.i.%' THEN 1
                 WHEN LOWER(job_title) LIKE '%BI%' THEN 1
                 ELSE 0 END) AS bi_count,
        SUM(CASE WHEN LOWER(job_title) LIKE '%architect%' THEN 1 ELSE 0 END) AS architect_count
FROM    c;
GO

SELECT  COUNT(DISTINCT company) AS company_count 
FROM    abstracts;
GO

SELECT  DISTINCT
        category,
        track,
        COUNT(track) OVER (PARTITION BY category) AS the_count_by_category,
        COUNT(track) OVER (PARTITION BY category, track) AS the_count_by_track
FROM    abstracts
ORDER BY category, track;
GO


SELECT  AVG(author_count) AS author_average
FROM    ( SELECT  DISTINCT 
                  first_name, 
                  last_name, 
                  COUNT(session_title) OVER (PARTITION BY first_name, last_name) AS author_count
          FROM   abstracts
        ) AS x;
GO

-- You might be weirded out by this string_to_array and array_upper.
-- PostgreSQL has an array data type. We're using an array of strings
-- to get an accurate word count
SELECT  AVG(CHAR_LENGTH(abstract)) AS avg_char_count, 
        STDDEV(CHAR_LENGTH(abstract)) AS char_count_stddev,
        AVG(array_upper(string_to_array(abstract, ' '), 1)) AS avg_word_count,
        STDDEV(array_upper(string_to_array(abstract, ' '), 1)) AS word_count_stddev
FROM    abstracts;
GO

-- Hey, look at that, we can use a single TRIM function to do all work
SELECT  SUM(CASE WHEN TRIM(BOTH FROM presented_before) = 'YES' THEN 1 ELSE 0 END) AS presented_before,
        SUM(CASE WHEN TRIM(BOTH FROM presented_before) = 'NO' THEN 1 ELSE 0 END) AS never_presented
FROM    abstracts;
GO

SELECT  level, 
        COUNT(*)
FROM    abstracts
GROUP BY level
ORDER BY level;
GO
Menu