The query used to capture GitHub repositories.
SELECT
max(concat(f.repo_name, ' ', f.path)) as repo_path,
c.content
FROM `bigquery-public-data.github_repos.files` as f
JOIN `bigquery-public-data.github_repos.contents` as c on f.id = c.id
JOIN (
--this part of the query makes sure repo is watched at least twice since 2017
SELECT repo FROM(
SELECT
repo.name as repo
FROM `githubarchive.year.2017` WHERE type="WatchEvent"
UNION ALL
SELECT
repo.name as repo
FROM `githubarchive.month.2019*` WHERE type="WatchEvent"
)
GROUP BY 1
HAVING COUNT(*) >= 2
) as r on f.repo_name = r.repo
WHERE
(f.path like '%.py' or f.path like '*.ipynb') and --with python extension
c.size < 15000 and --get rid of ridiculously long files
REGEXP_CONTAINS(c.content, r'def ') --contains function definition
group by c.content